1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include <string.h> 22cabdff1aSopenharmony_ci#include "libavcodec/vp9dsp.h" 23cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 24cabdff1aSopenharmony_ci#include "vp9dsp_mips.h" 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_ci#define VP9_DCT_CONST_BITS 14 27cabdff1aSopenharmony_ci#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n)) 28cabdff1aSopenharmony_ci 29cabdff1aSopenharmony_cistatic const int32_t cospi_1_64 = 16364; 30cabdff1aSopenharmony_cistatic const int32_t cospi_2_64 = 16305; 31cabdff1aSopenharmony_cistatic const int32_t cospi_3_64 = 16207; 32cabdff1aSopenharmony_cistatic const 
int32_t cospi_4_64 = 16069; 33cabdff1aSopenharmony_cistatic const int32_t cospi_5_64 = 15893; 34cabdff1aSopenharmony_cistatic const int32_t cospi_6_64 = 15679; 35cabdff1aSopenharmony_cistatic const int32_t cospi_7_64 = 15426; 36cabdff1aSopenharmony_cistatic const int32_t cospi_8_64 = 15137; 37cabdff1aSopenharmony_cistatic const int32_t cospi_9_64 = 14811; 38cabdff1aSopenharmony_cistatic const int32_t cospi_10_64 = 14449; 39cabdff1aSopenharmony_cistatic const int32_t cospi_11_64 = 14053; 40cabdff1aSopenharmony_cistatic const int32_t cospi_12_64 = 13623; 41cabdff1aSopenharmony_cistatic const int32_t cospi_13_64 = 13160; 42cabdff1aSopenharmony_cistatic const int32_t cospi_14_64 = 12665; 43cabdff1aSopenharmony_cistatic const int32_t cospi_15_64 = 12140; 44cabdff1aSopenharmony_cistatic const int32_t cospi_16_64 = 11585; 45cabdff1aSopenharmony_cistatic const int32_t cospi_17_64 = 11003; 46cabdff1aSopenharmony_cistatic const int32_t cospi_18_64 = 10394; 47cabdff1aSopenharmony_cistatic const int32_t cospi_19_64 = 9760; 48cabdff1aSopenharmony_cistatic const int32_t cospi_20_64 = 9102; 49cabdff1aSopenharmony_cistatic const int32_t cospi_21_64 = 8423; 50cabdff1aSopenharmony_cistatic const int32_t cospi_22_64 = 7723; 51cabdff1aSopenharmony_cistatic const int32_t cospi_23_64 = 7005; 52cabdff1aSopenharmony_cistatic const int32_t cospi_24_64 = 6270; 53cabdff1aSopenharmony_cistatic const int32_t cospi_25_64 = 5520; 54cabdff1aSopenharmony_cistatic const int32_t cospi_26_64 = 4756; 55cabdff1aSopenharmony_cistatic const int32_t cospi_27_64 = 3981; 56cabdff1aSopenharmony_cistatic const int32_t cospi_28_64 = 3196; 57cabdff1aSopenharmony_cistatic const int32_t cospi_29_64 = 2404; 58cabdff1aSopenharmony_cistatic const int32_t cospi_30_64 = 1606; 59cabdff1aSopenharmony_cistatic const int32_t cospi_31_64 = 804; 60cabdff1aSopenharmony_ci 61cabdff1aSopenharmony_ci// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 62cabdff1aSopenharmony_cistatic const int32_t sinpi_1_9 = 5283; 
/* Remaining ADST sine constants:
 * sinpi_k_9 = 16384 * sqrt(2) * sin(k * pi / 9) * 2 / 3, as an integer. */
static const int32_t sinpi_2_9 = 9929;
static const int32_t sinpi_3_9 = 13377;
static const int32_t sinpi_4_9 = 15212;

/*
 * Two-constant rotation of a pair of halfword vectors.
 *
 * Builds one coefficient vector holding cnst0 and cnst1 in alternating
 * halfword lanes, forms dot products of it with (reg0, -reg1) and with
 * (reg1, reg0) interleaved, rounds each 32-bit result by
 * VP9_DCT_CONST_BITS and packs the even halfwords into out0 / out1.
 * The interleave/dot-product/rounding helpers come from
 * generic_macros_msa.h (not visible here).
 */
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)  \
{                                                                  \
    v8i16 k0_m = __msa_fill_h(cnst0);                              \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                  \
                                                                   \
    s0_m = (v4i32) __msa_fill_h(cnst1);                            \
    k0_m = __msa_ilvev_h((v8i16) s0_m, k0_m);                      \
                                                                   \
    ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m);                        \
    ILVRL_H2_SW(reg0, reg1, s3_m, s2_m);                           \
    DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m);               \
    SRARI_W2_SW(s1_m, s0_m, VP9_DCT_CONST_BITS);                   \
    out0 = __msa_pckev_h((v8i16) s0_m, (v8i16) s1_m);              \
                                                                   \
    DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m);               \
    SRARI_W2_SW(s1_m, s0_m, VP9_DCT_CONST_BITS);                   \
    out1 = __msa_pckev_h((v8i16) s0_m, (v8i16) s1_m);              \
}

/*
 * Dot products followed by butterflies, rounding and packing.
 *
 * in0..in3 are interleaved halfword data vectors, in4..in7 the matching
 * coefficient vectors.  Eight 32-bit dot products are combined with two
 * BUTTERFLY_4 steps, rounded by VP9_DCT_CONST_BITS and packed back to
 * halfwords in dst0..dst3.
 */
#define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7,  \
                                  dst0, dst1, dst2, dst3)                  \
{                                                                          \
    v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m;                               \
    v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m;                               \
                                                                           \
    DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5,                    \
                tp0_m, tp2_m, tp3_m, tp4_m);                               \
    DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7,                    \
                tp5_m, tp6_m, tp7_m, tp8_m);                               \
    BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m);   \
    BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m);   \
    SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, VP9_DCT_CONST_BITS);           \
    SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, VP9_DCT_CONST_BITS);           \
    PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m,    \
                dst0, dst1, dst2, dst3);                                   \
}

/*
 * GNU statement expression: dot product of in0 and in1 with the same
 * coefficient vector in2, rounded by VP9_DCT_CONST_BITS and packed into
 * one halfword vector, which is the value of the expression.
 */
#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2)           \
( {                                                        \
    v8i16 dst_m;                                           \
    v4i32 tp0_m, tp1_m;                                    \
                                                           \
    DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m);         \
    SRARI_W2_SW(tp1_m, tp0_m, VP9_DCT_CONST_BITS);         \
    dst_m = __msa_pckev_h((v8i16) tp1_m, (v8i16) tp0_m);   \
                                                           \
    dst_m;                                                 \
} )

/*
 * 8-point ADST on eight halfword vectors.
 *
 * NOTE: in0..in7 are used as scratch and are overwritten; out1, out3,
 * out5 and out7 receive negated intermediate results.  Statement order
 * matters because constants and temporaries are reused between stages.
 */
#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,                 \
                  out0, out1, out2, out3, out4, out5, out6, out7)         \
{                                                                         \
    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                    \
    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                     \
    v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64,  \
        cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 };             \
    v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64,              \
        -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 };                  \
                                                                          \
    SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                       \
    cnst2_m = -cnst0_m;                                                   \
    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
    SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                       \
    cnst4_m = -cnst2_m;                                                   \
    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
                                                                          \
    ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                \
    ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                \
    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
                              cnst1_m, cnst2_m, cnst3_m, in7, in0,        \
                              in4, in3);                                  \
                                                                          \
    SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                       \
    cnst2_m = -cnst0_m;                                                   \
    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
    SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                       \
    cnst4_m = -cnst2_m;                                                   \
    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
                                                                          \
    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
                                                                          \
    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
                              cnst1_m, cnst2_m, cnst3_m, in5, in2,        \
                              in6, in1);                                  \
    BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                \
    out7 = -s0_m;                                                         \
    out0 = s1_m;                                                          \
                                                                          \
    SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5,                                    \
                 cnst0_m, cnst1_m, cnst2_m, cnst3_m);                     \
                                                                          \
    ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);    \
    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
    cnst1_m = cnst0_m;                                                    \
                                                                          \
    ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                \
    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
                              cnst2_m, cnst3_m, cnst1_m, out1, out6,      \
                              s0_m, s1_m);                                \
                                                                          \
    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                       \
    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
                                                                          \
    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
    ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                              \
    out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);            \
    out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);            \
    out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);            \
    out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);            \
                                                                          \
    out1 = -out1;                                                         \
    out3 = -out3;                                                         \
    out5 = -out5;                                                         \
}

/*
 * Interleaves m1/m0, takes dot products with coefficient vectors c0 and
 * c1, rounds by VP9_DCT_CONST_BITS and packs to halfwords:
 * res0 comes from c0, res1 from c1.
 */
#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1)                        \
{                                                                         \
    v4i32 madd0_m, madd1_m, madd2_m, madd3_m;                             \
    v8i16 madd_s0_m, madd_s1_m;                                           \
                                                                          \
    ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m);                            \
    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m,               \
                c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m);      \
    SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, VP9_DCT_CONST_BITS);  \
    PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1);          \
}

/*
 * Multiply-add with a butterfly between the two input pairs.
 *
 * (inp0, inp1) and (inp2, inp3) are interleaved, multiplied by
 * (cst0, cst2) for out0/out1 and by (cst1, cst3) for out2/out3; each
 * group goes through BUTTERFLY_4 before rounding and packing.
 */
#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,       \
                    out0, out1, out2, out3)                               \
{                                                                         \
    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m;                     \
                                                                          \
    ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m);                        \
    ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m);                        \
    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
                cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
                m4_m, m5_m, tmp3_m, tmp2_m);                              \
    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);          \
    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1);                  \
    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
                cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
                m4_m, m5_m, tmp3_m, tmp2_m);                              \
    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);          \
    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                  \
}

/*
 * GNU statement expression yielding a halfword vector built from two
 * scalar constants: c0_h and c1_h are each splatted, then combined with
 * an even-lane interleave so they alternate across the vector.
 */
#define VP9_SET_COSPI_PAIR(c0_h, c1_h)   \
( {                                      \
    v8i16 out0_m, r0_m, r1_m;            \
                                         \
    r0_m = __msa_fill_h(c0_h);           \
    r1_m = __msa_fill_h(c1_h);           \
    out0_m = __msa_ilvev_h(r1_m, r0_m);  \
                                         \
    out0_m;                              \
} )

/*
 * Adds four halfword residual vectors (in0..in3) to four rows of 8-bit
 * pixels loaded from dst (row pitch dst_stride), clips the sums with
 * CLIP_SH4_0_255, packs back to bytes and stores the four rows to dst.
 */
#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)   \
{                                                                  \
    uint8_t *dst_m = (uint8_t *) (dst);                            \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
    v16i8 tmp0_m, tmp1_m;                                          \
    v16i8 zero_m = { 0 };                                          \
    v8i16 res0_m, res1_m, res2_m, res3_m;                          \
                                                                   \
    LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);     \
    ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m,     \
               zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);    \
    ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3,       \
         res0_m, res1_m, res2_m, res3_m);                          \
    CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);                \
    PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);   \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, dst_m, dst_stride);          \
}

/*
 * One 1-D pass of the 4-point inverse DCT over four halfword vectors.
 *
 * in0/in2 are combined with +/-cospi_16_64, in1/in3 with the
 * cospi_8_64 / cospi_24_64 pair; results are rounded by
 * VP9_DCT_CONST_BITS and finished with a BUTTERFLY_4 into out0..out3.
 * Outputs may alias the inputs (callers in this file do so).
 */
#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)       \
{                                                                     \
    v8i16 c0_m, c1_m, c2_m, c3_m;                                     \
    v8i16 step0_m, step1_m;                                           \
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
    v16i8 zeros = { 0 };                                              \
                                                                      \
    c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
    c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
    step0_m = __msa_ilvr_h(in2, in0);                                 \
    DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m);        \
                                                                      \
    c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
    c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
    step1_m = __msa_ilvr_h(in3, in1);                                 \
    DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m);        \
    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);  \
                                                                      \
    PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);      \
    SLDI_B2_SW(zeros, tmp0_m, zeros, tmp2_m, 8, tmp1_m, tmp3_m);      \
    BUTTERFLY_4((v8i16) tmp0_m, (v8i16) tmp1_m,                       \
                (v8i16) tmp2_m, (v8i16) tmp3_m,                       \
                out0, out1, out2, out3);                              \
}

/*
 * One 1-D pass of the 4-point inverse ADST over four halfword vectors.
 *
 * mask_m holds +/-sinpi_{1..4}_9; the four 32-bit accumulators
 * int0_m..int3_m are built from dot products of interleaved inputs with
 * pairs of those constants, rounded by VP9_DCT_CONST_BITS and packed
 * into out0..out3.  tmp2_m from the first stage is reused when forming
 * int3_m, so the statement order must be preserved.
 */
#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)      \
{                                                                     \
    v8i16 res0_m, res1_m, c0_m, c1_m;                                 \
    v8i16 k1_m, k2_m, k3_m, k4_m;                                     \
    v8i16 zero_m = { 0 };                                             \
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
    v4i32 int0_m, int1_m, int2_m, int3_m;                             \
    v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9,                 \
        sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9,                \
        -sinpi_4_9 };                                                 \
                                                                      \
    SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);         \
    ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                  \
    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m);          \
    int0_m = tmp2_m + tmp1_m;                                         \
                                                                      \
    SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m);                           \
    ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m);                  \
    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
    int1_m = tmp0_m + tmp1_m;                                         \
                                                                      \
    c0_m = __msa_splati_h(mask_m, 6);                                 \
    ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m);                 \
    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
    int2_m = tmp0_m + tmp1_m;                                         \
                                                                      \
    c0_m = __msa_splati_h(mask_m, 6);                                 \
    c0_m = __msa_ilvev_h(c0_m, k1_m);                                 \
                                                                      \
    res0_m = __msa_ilvr_h((in1), (in3));                              \
    tmp0_m = __msa_dotp_s_w(res0_m, c0_m);                            \
    int3_m = tmp2_m + tmp0_m;                                         \
                                                                      \
    res0_m = __msa_ilvr_h((in2), (in3));                              \
    c1_m = __msa_ilvev_h(k4_m, k3_m);                                 \
                                                                      \
    tmp2_m = __msa_dotp_s_w(res0_m, c1_m);                            \
    res1_m = __msa_ilvr_h((in0), (in2));                              \
    c1_m = __msa_ilvev_h(k1_m, zero_m);                               \
                                                                      \
    tmp3_m = __msa_dotp_s_w(res1_m, c1_m);                            \
    int3_m += tmp2_m;                                                 \
    int3_m += tmp3_m;                                                 \
                                                                      \
    SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, VP9_DCT_CONST_BITS);  \
    PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1);          \
    PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);          \
}
322cabdff1aSopenharmony_ci 323cabdff1aSopenharmony_ci#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ 324cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7) \ 325cabdff1aSopenharmony_ci{ \ 326cabdff1aSopenharmony_ci v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 327cabdff1aSopenharmony_ci v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ 328cabdff1aSopenharmony_ci v8i16 zero_m = { 0 }; \ 329cabdff1aSopenharmony_ci \ 330cabdff1aSopenharmony_ci ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ 331cabdff1aSopenharmony_ci tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ 332cabdff1aSopenharmony_ci ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ 333cabdff1aSopenharmony_ci ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ 334cabdff1aSopenharmony_ci \ 335cabdff1aSopenharmony_ci out0 = (v8i16) __msa_ilvr_d((v2i64) tmp1_m, (v2i64) tmp0_m); \ 336cabdff1aSopenharmony_ci out1 = (v8i16) __msa_ilvl_d((v2i64) tmp1_m, (v2i64) tmp0_m); \ 337cabdff1aSopenharmony_ci out2 = (v8i16) __msa_ilvr_d((v2i64) tmp3_m, (v2i64) tmp2_m); \ 338cabdff1aSopenharmony_ci out3 = (v8i16) __msa_ilvl_d((v2i64) tmp3_m, (v2i64) tmp2_m); \ 339cabdff1aSopenharmony_ci \ 340cabdff1aSopenharmony_ci out4 = zero_m; \ 341cabdff1aSopenharmony_ci out5 = zero_m; \ 342cabdff1aSopenharmony_ci out6 = zero_m; \ 343cabdff1aSopenharmony_ci out7 = zero_m; \ 344cabdff1aSopenharmony_ci} 345cabdff1aSopenharmony_ci 346cabdff1aSopenharmony_cistatic void vp9_idct4x4_1_add_msa(int16_t *input, uint8_t *dst, 347cabdff1aSopenharmony_ci int32_t dst_stride) 348cabdff1aSopenharmony_ci{ 349cabdff1aSopenharmony_ci int16_t out; 350cabdff1aSopenharmony_ci v8i16 vec; 351cabdff1aSopenharmony_ci 352cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); 353cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); 354cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO(out, 4); 355cabdff1aSopenharmony_ci vec = __msa_fill_h(out); 356cabdff1aSopenharmony_ci input[0] = 0; 
357cabdff1aSopenharmony_ci 358cabdff1aSopenharmony_ci ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride); 359cabdff1aSopenharmony_ci} 360cabdff1aSopenharmony_ci 361cabdff1aSopenharmony_cistatic void vp9_idct4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst, 362cabdff1aSopenharmony_ci int32_t dst_stride) 363cabdff1aSopenharmony_ci{ 364cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 365cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 366cabdff1aSopenharmony_ci 367cabdff1aSopenharmony_ci /* load vector elements of 4x4 block */ 368cabdff1aSopenharmony_ci in0 = LD_SH(input); 369cabdff1aSopenharmony_ci in2 = LD_SH(input + 8); 370cabdff1aSopenharmony_ci in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0); 371cabdff1aSopenharmony_ci in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2); 372cabdff1aSopenharmony_ci ST_SH2(zero, zero, input, 8); 373cabdff1aSopenharmony_ci /* rows */ 374cabdff1aSopenharmony_ci VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); 375cabdff1aSopenharmony_ci /* columns */ 376cabdff1aSopenharmony_ci TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); 377cabdff1aSopenharmony_ci VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); 378cabdff1aSopenharmony_ci /* rounding (add 2^3, divide by 2^4) */ 379cabdff1aSopenharmony_ci SRARI_H4_SH(in0, in1, in2, in3, 4); 380cabdff1aSopenharmony_ci ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); 381cabdff1aSopenharmony_ci} 382cabdff1aSopenharmony_ci 383cabdff1aSopenharmony_cistatic void vp9_iadst4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst, 384cabdff1aSopenharmony_ci int32_t dst_stride) 385cabdff1aSopenharmony_ci{ 386cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 387cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 388cabdff1aSopenharmony_ci 389cabdff1aSopenharmony_ci /* load vector elements of 4x4 block */ 390cabdff1aSopenharmony_ci in0 = LD_SH(input); 391cabdff1aSopenharmony_ci in2 = LD_SH(input + 8); 392cabdff1aSopenharmony_ci in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0); 
393cabdff1aSopenharmony_ci in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2); 394cabdff1aSopenharmony_ci ST_SH2(zero, zero, input, 8); 395cabdff1aSopenharmony_ci /* rows */ 396cabdff1aSopenharmony_ci VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); 397cabdff1aSopenharmony_ci /* columns */ 398cabdff1aSopenharmony_ci TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); 399cabdff1aSopenharmony_ci VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); 400cabdff1aSopenharmony_ci /* rounding (add 2^3, divide by 2^4) */ 401cabdff1aSopenharmony_ci SRARI_H4_SH(in0, in1, in2, in3, 4); 402cabdff1aSopenharmony_ci ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); 403cabdff1aSopenharmony_ci} 404cabdff1aSopenharmony_ci 405cabdff1aSopenharmony_cistatic void vp9_iadst_idct_4x4_add_msa(int16_t *input, uint8_t *dst, 406cabdff1aSopenharmony_ci int32_t dst_stride, int32_t eob) 407cabdff1aSopenharmony_ci{ 408cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 409cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 410cabdff1aSopenharmony_ci 411cabdff1aSopenharmony_ci /* load vector elements of 4x4 block */ 412cabdff1aSopenharmony_ci in0 = LD_SH(input); 413cabdff1aSopenharmony_ci in2 = LD_SH(input + 8); 414cabdff1aSopenharmony_ci in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0); 415cabdff1aSopenharmony_ci in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2); 416cabdff1aSopenharmony_ci ST_SH2(zero, zero, input, 8); 417cabdff1aSopenharmony_ci /* cols */ 418cabdff1aSopenharmony_ci VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); 419cabdff1aSopenharmony_ci /* columns */ 420cabdff1aSopenharmony_ci TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); 421cabdff1aSopenharmony_ci VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); 422cabdff1aSopenharmony_ci /* rounding (add 2^3, divide by 2^4) */ 423cabdff1aSopenharmony_ci SRARI_H4_SH(in0, in1, in2, in3, 4); 424cabdff1aSopenharmony_ci ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); 425cabdff1aSopenharmony_ci} 
426cabdff1aSopenharmony_ci 427cabdff1aSopenharmony_cistatic void vp9_idct_iadst_4x4_add_msa(int16_t *input, uint8_t *dst, 428cabdff1aSopenharmony_ci int32_t dst_stride, int32_t eob) 429cabdff1aSopenharmony_ci{ 430cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 431cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 432cabdff1aSopenharmony_ci 433cabdff1aSopenharmony_ci /* load vector elements of 4x4 block */ 434cabdff1aSopenharmony_ci in0 = LD_SH(input); 435cabdff1aSopenharmony_ci in2 = LD_SH(input + 8); 436cabdff1aSopenharmony_ci in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0); 437cabdff1aSopenharmony_ci in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2); 438cabdff1aSopenharmony_ci ST_SH2(zero, zero, input, 8); 439cabdff1aSopenharmony_ci /* cols */ 440cabdff1aSopenharmony_ci VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); 441cabdff1aSopenharmony_ci /* columns */ 442cabdff1aSopenharmony_ci TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); 443cabdff1aSopenharmony_ci VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); 444cabdff1aSopenharmony_ci /* rounding (add 2^3, divide by 2^4) */ 445cabdff1aSopenharmony_ci SRARI_H4_SH(in0, in1, in2, in3, 4); 446cabdff1aSopenharmony_ci ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); 447cabdff1aSopenharmony_ci} 448cabdff1aSopenharmony_ci 449cabdff1aSopenharmony_ci#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ 450cabdff1aSopenharmony_ci( { \ 451cabdff1aSopenharmony_ci v8i16 c0_m, c1_m; \ 452cabdff1aSopenharmony_ci \ 453cabdff1aSopenharmony_ci SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ 454cabdff1aSopenharmony_ci c0_m = __msa_ilvev_h(c1_m, c0_m); \ 455cabdff1aSopenharmony_ci \ 456cabdff1aSopenharmony_ci c0_m; \ 457cabdff1aSopenharmony_ci} ) 458cabdff1aSopenharmony_ci 459cabdff1aSopenharmony_ci/* multiply and add macro */ 460cabdff1aSopenharmony_ci#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ 461cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 462cabdff1aSopenharmony_ci{ \ 
463cabdff1aSopenharmony_ci v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ 464cabdff1aSopenharmony_ci v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 465cabdff1aSopenharmony_ci \ 466cabdff1aSopenharmony_ci ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ 467cabdff1aSopenharmony_ci ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ 468cabdff1aSopenharmony_ci DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \ 469cabdff1aSopenharmony_ci cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 470cabdff1aSopenharmony_ci SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \ 471cabdff1aSopenharmony_ci PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ 472cabdff1aSopenharmony_ci DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \ 473cabdff1aSopenharmony_ci cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 474cabdff1aSopenharmony_ci SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \ 475cabdff1aSopenharmony_ci PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ 476cabdff1aSopenharmony_ci} 477cabdff1aSopenharmony_ci 478cabdff1aSopenharmony_ci/* idct 8x8 macro */ 479cabdff1aSopenharmony_ci#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ 480cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7) \ 481cabdff1aSopenharmony_ci{ \ 482cabdff1aSopenharmony_ci v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ 483cabdff1aSopenharmony_ci v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ 484cabdff1aSopenharmony_ci v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 485cabdff1aSopenharmony_ci v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ 486cabdff1aSopenharmony_ci cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ 487cabdff1aSopenharmony_ci \ 488cabdff1aSopenharmony_ci k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \ 489cabdff1aSopenharmony_ci k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \ 490cabdff1aSopenharmony_ci k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \ 
491cabdff1aSopenharmony_ci k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \ 492cabdff1aSopenharmony_ci VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ 493cabdff1aSopenharmony_ci SUB2(in1, in3, in7, in5, res0_m, res1_m); \ 494cabdff1aSopenharmony_ci k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \ 495cabdff1aSopenharmony_ci k1_m = __msa_splati_h(mask_m, 4); \ 496cabdff1aSopenharmony_ci \ 497cabdff1aSopenharmony_ci ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ 498cabdff1aSopenharmony_ci DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ 499cabdff1aSopenharmony_ci tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 500cabdff1aSopenharmony_ci SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \ 501cabdff1aSopenharmony_ci tp4_m = in1 + in3; \ 502cabdff1aSopenharmony_ci PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ 503cabdff1aSopenharmony_ci tp7_m = in7 + in5; \ 504cabdff1aSopenharmony_ci k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ 505cabdff1aSopenharmony_ci k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ 506cabdff1aSopenharmony_ci VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \ 507cabdff1aSopenharmony_ci in0, in4, in2, in6); \ 508cabdff1aSopenharmony_ci BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ 509cabdff1aSopenharmony_ci BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \ 510cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7); \ 511cabdff1aSopenharmony_ci} 512cabdff1aSopenharmony_ci 513cabdff1aSopenharmony_ci#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ 514cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7) \ 515cabdff1aSopenharmony_ci{ \ 516cabdff1aSopenharmony_ci v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ 517cabdff1aSopenharmony_ci v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ 518cabdff1aSopenharmony_ci v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ 
519cabdff1aSopenharmony_ci v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \ 520cabdff1aSopenharmony_ci cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ 521cabdff1aSopenharmony_ci v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \ 522cabdff1aSopenharmony_ci cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ 523cabdff1aSopenharmony_ci v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \ 524cabdff1aSopenharmony_ci -cospi_16_64, 0, 0, 0, 0 }; \ 525cabdff1aSopenharmony_ci \ 526cabdff1aSopenharmony_ci k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \ 527cabdff1aSopenharmony_ci k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \ 528cabdff1aSopenharmony_ci ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ 529cabdff1aSopenharmony_ci DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ 530cabdff1aSopenharmony_ci r0_m, r1_m, r2_m, r3_m); \ 531cabdff1aSopenharmony_ci k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \ 532cabdff1aSopenharmony_ci k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \ 533cabdff1aSopenharmony_ci ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ 534cabdff1aSopenharmony_ci DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ 535cabdff1aSopenharmony_ci r4_m, r5_m, r6_m, r7_m); \ 536cabdff1aSopenharmony_ci ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ 537cabdff1aSopenharmony_ci m0_m, m1_m, m2_m, m3_m); \ 538cabdff1aSopenharmony_ci SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ 539cabdff1aSopenharmony_ci PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ 540cabdff1aSopenharmony_ci SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ 541cabdff1aSopenharmony_ci m0_m, m1_m, m2_m, m3_m); \ 542cabdff1aSopenharmony_ci SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ 543cabdff1aSopenharmony_ci PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ 544cabdff1aSopenharmony_ci k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \ 545cabdff1aSopenharmony_ci k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \ 
546cabdff1aSopenharmony_ci ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ 547cabdff1aSopenharmony_ci DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ 548cabdff1aSopenharmony_ci r0_m, r1_m, r2_m, r3_m); \ 549cabdff1aSopenharmony_ci k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \ 550cabdff1aSopenharmony_ci k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \ 551cabdff1aSopenharmony_ci ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ 552cabdff1aSopenharmony_ci DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ 553cabdff1aSopenharmony_ci r4_m, r5_m, r6_m, r7_m); \ 554cabdff1aSopenharmony_ci ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ 555cabdff1aSopenharmony_ci m0_m, m1_m, m2_m, m3_m); \ 556cabdff1aSopenharmony_ci SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ 557cabdff1aSopenharmony_ci PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ 558cabdff1aSopenharmony_ci SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ 559cabdff1aSopenharmony_ci m0_m, m1_m, m2_m, m3_m); \ 560cabdff1aSopenharmony_ci SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ 561cabdff1aSopenharmony_ci PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ 562cabdff1aSopenharmony_ci ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ 563cabdff1aSopenharmony_ci BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ 564cabdff1aSopenharmony_ci k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \ 565cabdff1aSopenharmony_ci k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \ 566cabdff1aSopenharmony_ci ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ 567cabdff1aSopenharmony_ci DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ 568cabdff1aSopenharmony_ci r0_m, r1_m, r2_m, r3_m); \ 569cabdff1aSopenharmony_ci k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \ 570cabdff1aSopenharmony_ci DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ 571cabdff1aSopenharmony_ci r4_m, r5_m, r6_m, r7_m); \ 572cabdff1aSopenharmony_ci ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ 
573cabdff1aSopenharmony_ci m0_m, m1_m, m2_m, m3_m); \ 574cabdff1aSopenharmony_ci SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ 575cabdff1aSopenharmony_ci PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ 576cabdff1aSopenharmony_ci SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ 577cabdff1aSopenharmony_ci m0_m, m1_m, m2_m, m3_m); \ 578cabdff1aSopenharmony_ci SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ 579cabdff1aSopenharmony_ci PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ 580cabdff1aSopenharmony_ci k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \ 581cabdff1aSopenharmony_ci k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \ 582cabdff1aSopenharmony_ci ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ 583cabdff1aSopenharmony_ci DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ 584cabdff1aSopenharmony_ci m0_m, m1_m, m2_m, m3_m); \ 585cabdff1aSopenharmony_ci SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ 586cabdff1aSopenharmony_ci PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ 587cabdff1aSopenharmony_ci ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ 588cabdff1aSopenharmony_ci DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ 589cabdff1aSopenharmony_ci m0_m, m1_m, m2_m, m3_m); \ 590cabdff1aSopenharmony_ci SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ 591cabdff1aSopenharmony_ci PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ 592cabdff1aSopenharmony_ci \ 593cabdff1aSopenharmony_ci out1 = -in1; \ 594cabdff1aSopenharmony_ci out3 = -in3; \ 595cabdff1aSopenharmony_ci out5 = -in5; \ 596cabdff1aSopenharmony_ci out7 = -in7; \ 597cabdff1aSopenharmony_ci} 598cabdff1aSopenharmony_ci /* vp9_idct8x8_1_add_msa: variant that reads only input[0]. input[0] is scaled twice by cospi_16_64 with ROUND_POWER_OF_TWO(..., VP9_DCT_CONST_BITS) (each intermediate is stored back into the int16_t 'out'), rounded once more by 5 bits, and replicated into 'vec' with __msa_fill_h. input[0] is then cleared and the same vector is handed to VP9_ADDBLK_ST8x4_UB twice, the second time with dst advanced by 4 * dst_stride. */ 599cabdff1aSopenharmony_cistatic void vp9_idct8x8_1_add_msa(int16_t *input, uint8_t *dst, 600cabdff1aSopenharmony_ci int32_t dst_stride) 601cabdff1aSopenharmony_ci{ 602cabdff1aSopenharmony_ci int16_t out; 603cabdff1aSopenharmony_ci int32_t val; 604cabdff1aSopenharmony_ci v8i16 vec; 605cabdff1aSopenharmony_ci 606cabdff1aSopenharmony_ci out =
ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); 607cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); 608cabdff1aSopenharmony_ci val = ROUND_POWER_OF_TWO(out, 5); 609cabdff1aSopenharmony_ci vec = __msa_fill_h(val); 610cabdff1aSopenharmony_ci input[0] = 0; 611cabdff1aSopenharmony_ci 612cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); 613cabdff1aSopenharmony_ci dst += (4 * dst_stride); 614cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); 615cabdff1aSopenharmony_ci} 616cabdff1aSopenharmony_ci /* vp9_idct8x8_12_colcol_addblk_msa: loads the 8x8 coefficient block (eight v8i16 vectors, stride 8), zeroes it in memory, and merges the loaded vectors pairwise with ILVR_D2_SH so that only four vectors (in0..in3) feed the hand-written first pass (stage1..stage4). The result is transposed with TRANSPOSE4X8_SH_SH, run through VP9_IDCT8x8_1D, rounded by 5 bits and added to dst. NOTE(review): the "_12" suffix presumably marks a reduced-coefficient fast path in which only the low half of each row is used - confirm against the caller. */ 617cabdff1aSopenharmony_cistatic void vp9_idct8x8_12_colcol_addblk_msa(int16_t *input, uint8_t *dst, 618cabdff1aSopenharmony_ci int32_t dst_stride) 619cabdff1aSopenharmony_ci{ 620cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 621cabdff1aSopenharmony_ci v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3; 622cabdff1aSopenharmony_ci v4i32 tmp0, tmp1, tmp2, tmp3; 623cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 624cabdff1aSopenharmony_ci 625cabdff1aSopenharmony_ci /* load vector elements of 8x8 block */ 626cabdff1aSopenharmony_ci LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); 627cabdff1aSopenharmony_ci /* clear the coefficient block after it has been loaded */ ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8); 628cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 629cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in2, in3); 630cabdff1aSopenharmony_ci 631cabdff1aSopenharmony_ci /* stage1 */ 632cabdff1aSopenharmony_ci ILVL_H2_SH(in3, in0, in2, in1, s0, s1); 633cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); 634cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); 635cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); 636cabdff1aSopenharmony_ci k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); 637cabdff1aSopenharmony_ci DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3,
tmp0, tmp1, tmp2, tmp3); 638cabdff1aSopenharmony_ci SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, VP9_DCT_CONST_BITS); 639cabdff1aSopenharmony_ci PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); 640cabdff1aSopenharmony_ci PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); 641cabdff1aSopenharmony_ci BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5); 642cabdff1aSopenharmony_ci 643cabdff1aSopenharmony_ci /* stage2 */ 644cabdff1aSopenharmony_ci ILVR_H2_SH(in3, in1, in2, in0, s1, s0); 645cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); 646cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); 647cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); 648cabdff1aSopenharmony_ci k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); 649cabdff1aSopenharmony_ci DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); 650cabdff1aSopenharmony_ci SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, VP9_DCT_CONST_BITS); 651cabdff1aSopenharmony_ci PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); 652cabdff1aSopenharmony_ci PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); 653cabdff1aSopenharmony_ci BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3); 654cabdff1aSopenharmony_ci 655cabdff1aSopenharmony_ci /* stage3 */ 656cabdff1aSopenharmony_ci s0 = __msa_ilvr_h(s6, s5); 657cabdff1aSopenharmony_ci 658cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); 659cabdff1aSopenharmony_ci DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1); 660cabdff1aSopenharmony_ci SRARI_W2_SW(tmp0, tmp1, VP9_DCT_CONST_BITS); 661cabdff1aSopenharmony_ci PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); 662cabdff1aSopenharmony_ci 663cabdff1aSopenharmony_ci /* stage4 */ 664cabdff1aSopenharmony_ci BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, 665cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 666cabdff1aSopenharmony_ci TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 667cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 668cabdff1aSopenharmony_ci 
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, 669cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 670cabdff1aSopenharmony_ci 671cabdff1aSopenharmony_ci /* final rounding (add 2^4, divide by 2^5) and shift */ 672cabdff1aSopenharmony_ci SRARI_H4_SH(in0, in1, in2, in3, 5); 673cabdff1aSopenharmony_ci SRARI_H4_SH(in4, in5, in6, in7, 5); 674cabdff1aSopenharmony_ci 675cabdff1aSopenharmony_ci /* add block and store 8x8 */ 676cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); 677cabdff1aSopenharmony_ci dst += (4 * dst_stride); 678cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); 679cabdff1aSopenharmony_ci} 680cabdff1aSopenharmony_ci /* vp9_idct8x8_colcol_addblk_msa: full 8x8 inverse DCT followed by add-to-destination. Loads eight v8i16 vectors from input (stride 8) and zeroes the coefficient buffer, applies VP9_IDCT8x8_1D, transposes with TRANSPOSE8x8_SH_SH, applies VP9_IDCT8x8_1D again, rounds every vector by 5 bits (SRARI_H4_SH) and adds the result to dst through two VP9_ADDBLK_ST8x4_UB calls, the second one after advancing dst by 4 * dst_stride. */ 681cabdff1aSopenharmony_cistatic void vp9_idct8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst, 682cabdff1aSopenharmony_ci int32_t dst_stride) 683cabdff1aSopenharmony_ci{ 684cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 685cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 686cabdff1aSopenharmony_ci 687cabdff1aSopenharmony_ci /* load vector elements of 8x8 block */ 688cabdff1aSopenharmony_ci LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); 689cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8); 690cabdff1aSopenharmony_ci /* 1D idct8x8 */ 691cabdff1aSopenharmony_ci VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, 692cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 693cabdff1aSopenharmony_ci /* columns transform */ 694cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 695cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 696cabdff1aSopenharmony_ci /* 1D idct8x8 */ 697cabdff1aSopenharmony_ci VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, 698cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 699cabdff1aSopenharmony_ci /* final rounding (add 2^4, divide by 2^5) and shift */ 700cabdff1aSopenharmony_ci
SRARI_H4_SH(in0, in1, in2, in3, 5); 701cabdff1aSopenharmony_ci SRARI_H4_SH(in4, in5, in6, in7, 5); 702cabdff1aSopenharmony_ci /* add block and store 8x8 */ 703cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); 704cabdff1aSopenharmony_ci dst += (4 * dst_stride); 705cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); 706cabdff1aSopenharmony_ci} 707cabdff1aSopenharmony_ci /* vp9_iadst8x8_colcol_addblk_msa: ADST in both passes, followed by add-to-destination. The first pass is VP9_ADST8 on the eight loaded vectors (the coefficient buffer is zeroed right after the load), then TRANSPOSE8x8_SH_SH. The second pass is written out inline with cospi constant pairs built from __msa_fill_h / ILVEV_H2_SH, and is interleaved with the output stage: each pair of result rows is rounded by 5 bits, added to the matching dst rows, clipped with CLIP_SH_0_255 / CLIP_SH2_0_255 and stored with ST_D1. Rows are written in the pairs (0, 7), (1, 6), (3, 4) and (2, 5). */ 708cabdff1aSopenharmony_cistatic void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst, 709cabdff1aSopenharmony_ci int32_t dst_stride) 710cabdff1aSopenharmony_ci{ 711cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 712cabdff1aSopenharmony_ci v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 713cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 714cabdff1aSopenharmony_ci v8i16 out0, out1, out2, out3, out4, out5, out6, out7; 715cabdff1aSopenharmony_ci v8i16 cnst0, cnst1, cnst2, cnst3, cnst4; 716cabdff1aSopenharmony_ci v8i16 temp0, temp1, temp2, temp3, s0, s1; 717cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 718cabdff1aSopenharmony_ci 719cabdff1aSopenharmony_ci /* load vector elements of 8x8 block */ 720cabdff1aSopenharmony_ci LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); 721cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8); 722cabdff1aSopenharmony_ci 723cabdff1aSopenharmony_ci /* 1D adst8x8 */ 724cabdff1aSopenharmony_ci VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, 725cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 726cabdff1aSopenharmony_ci 727cabdff1aSopenharmony_ci /* columns transform */ 728cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 729cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 730cabdff1aSopenharmony_ci 731cabdff1aSopenharmony_ci cnst0 = __msa_fill_h(cospi_2_64); 732cabdff1aSopenharmony_ci cnst1 = __msa_fill_h(cospi_30_64);
733cabdff1aSopenharmony_ci cnst2 = -cnst0; 734cabdff1aSopenharmony_ci ILVEV_H2_SH(cnst0, cnst1, cnst1, cnst2, cnst0, cnst1); 735cabdff1aSopenharmony_ci cnst2 = __msa_fill_h(cospi_18_64); 736cabdff1aSopenharmony_ci cnst3 = __msa_fill_h(cospi_14_64); 737cabdff1aSopenharmony_ci cnst4 = -cnst2; 738cabdff1aSopenharmony_ci ILVEV_H2_SH(cnst2, cnst3, cnst3, cnst4, cnst2, cnst3); 739cabdff1aSopenharmony_ci 740cabdff1aSopenharmony_ci ILVRL_H2_SH(in0, in7, temp1, temp0); 741cabdff1aSopenharmony_ci ILVRL_H2_SH(in4, in3, temp3, temp2); 742cabdff1aSopenharmony_ci VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst1, cnst2, 743cabdff1aSopenharmony_ci cnst3, in7, in0, in4, in3); 744cabdff1aSopenharmony_ci 745cabdff1aSopenharmony_ci cnst0 = __msa_fill_h(cospi_10_64); 746cabdff1aSopenharmony_ci cnst1 = __msa_fill_h(cospi_22_64); 747cabdff1aSopenharmony_ci cnst2 = -cnst0; 748cabdff1aSopenharmony_ci ILVEV_H2_SH(cnst0, cnst1, cnst1, cnst2, cnst0, cnst1); 749cabdff1aSopenharmony_ci cnst2 = __msa_fill_h(cospi_26_64); 750cabdff1aSopenharmony_ci cnst3 = __msa_fill_h(cospi_6_64); 751cabdff1aSopenharmony_ci cnst4 = -cnst2; 752cabdff1aSopenharmony_ci ILVEV_H2_SH(cnst2, cnst3, cnst3, cnst4, cnst2, cnst3); 753cabdff1aSopenharmony_ci 754cabdff1aSopenharmony_ci ILVRL_H2_SH(in2, in5, temp1, temp0); 755cabdff1aSopenharmony_ci ILVRL_H2_SH(in6, in1, temp3, temp2); 756cabdff1aSopenharmony_ci VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst1, cnst2, 757cabdff1aSopenharmony_ci cnst3, in5, in2, in6, in1); 758cabdff1aSopenharmony_ci BUTTERFLY_4(in7, in0, in2, in5, s1, s0, in2, in5); 759cabdff1aSopenharmony_ci out7 = -s0; 760cabdff1aSopenharmony_ci out0 = s1; 761cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out7, 5); 762cabdff1aSopenharmony_ci dst0 = LD_UB(dst + 0 * dst_stride); 763cabdff1aSopenharmony_ci dst7 = LD_UB(dst + 7 * dst_stride); 764cabdff1aSopenharmony_ci 765cabdff1aSopenharmony_ci res0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst0); 766cabdff1aSopenharmony_ci 
res0 += out0; 767cabdff1aSopenharmony_ci CLIP_SH_0_255(res0); 768cabdff1aSopenharmony_ci res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0); 769cabdff1aSopenharmony_ci ST_D1(res0, 0, dst); 770cabdff1aSopenharmony_ci 771cabdff1aSopenharmony_ci res7 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst7); 772cabdff1aSopenharmony_ci res7 += out7; 773cabdff1aSopenharmony_ci CLIP_SH_0_255(res7); 774cabdff1aSopenharmony_ci res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7); 775cabdff1aSopenharmony_ci ST_D1(res7, 0, dst + 7 * dst_stride); 776cabdff1aSopenharmony_ci 777cabdff1aSopenharmony_ci cnst1 = __msa_fill_h(cospi_24_64); 778cabdff1aSopenharmony_ci cnst0 = __msa_fill_h(cospi_8_64); 779cabdff1aSopenharmony_ci cnst3 = -cnst1; 780cabdff1aSopenharmony_ci cnst2 = -cnst0; 781cabdff1aSopenharmony_ci 782cabdff1aSopenharmony_ci ILVEV_H2_SH(cnst3, cnst0, cnst1, cnst2, cnst3, cnst2); 783cabdff1aSopenharmony_ci cnst0 = __msa_ilvev_h(cnst1, cnst0); 784cabdff1aSopenharmony_ci cnst1 = cnst0; 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci ILVRL_H2_SH(in4, in3, temp1, temp0); 787cabdff1aSopenharmony_ci ILVRL_H2_SH(in6, in1, temp3, temp2); 788cabdff1aSopenharmony_ci VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst2, cnst3, 789cabdff1aSopenharmony_ci cnst1, out1, out6, s0, s1); 790cabdff1aSopenharmony_ci out1 = -out1; 791cabdff1aSopenharmony_ci SRARI_H2_SH(out1, out6, 5); 792cabdff1aSopenharmony_ci dst1 = LD_UB(dst + 1 * dst_stride); 793cabdff1aSopenharmony_ci dst6 = LD_UB(dst + 6 * dst_stride); 794cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst1, zero, dst6, res1, res6); 795cabdff1aSopenharmony_ci ADD2(res1, out1, res6, out6, res1, res6); 796cabdff1aSopenharmony_ci CLIP_SH2_0_255(res1, res6); 797cabdff1aSopenharmony_ci PCKEV_B2_SH(res1, res1, res6, res6, res1, res6); 798cabdff1aSopenharmony_ci ST_D1(res1, 0, dst + dst_stride); 799cabdff1aSopenharmony_ci ST_D1(res6, 0, dst + 6 * dst_stride); 800cabdff1aSopenharmony_ci 801cabdff1aSopenharmony_ci cnst0 = 
__msa_fill_h(cospi_16_64); 802cabdff1aSopenharmony_ci cnst1 = -cnst0; 803cabdff1aSopenharmony_ci cnst1 = __msa_ilvev_h(cnst1, cnst0); 804cabdff1aSopenharmony_ci 805cabdff1aSopenharmony_ci ILVRL_H2_SH(in2, in5, temp1, temp0); 806cabdff1aSopenharmony_ci ILVRL_H2_SH(s0, s1, temp3, temp2); 807cabdff1aSopenharmony_ci out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp0, temp1, cnst0); 808cabdff1aSopenharmony_ci out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp0, temp1, cnst1); 809cabdff1aSopenharmony_ci out3 = -out3; 810cabdff1aSopenharmony_ci SRARI_H2_SH(out3, out4, 5); 811cabdff1aSopenharmony_ci dst3 = LD_UB(dst + 3 * dst_stride); 812cabdff1aSopenharmony_ci dst4 = LD_UB(dst + 4 * dst_stride); 813cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst3, zero, dst4, res3, res4); 814cabdff1aSopenharmony_ci ADD2(res3, out3, res4, out4, res3, res4); 815cabdff1aSopenharmony_ci CLIP_SH2_0_255(res3, res4); 816cabdff1aSopenharmony_ci PCKEV_B2_SH(res3, res3, res4, res4, res3, res4); 817cabdff1aSopenharmony_ci ST_D1(res3, 0, dst + 3 * dst_stride); 818cabdff1aSopenharmony_ci ST_D1(res4, 0, dst + 4 * dst_stride); 819cabdff1aSopenharmony_ci 820cabdff1aSopenharmony_ci out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst0); 821cabdff1aSopenharmony_ci out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst1); 822cabdff1aSopenharmony_ci out5 = -out5; 823cabdff1aSopenharmony_ci SRARI_H2_SH(out2, out5, 5); 824cabdff1aSopenharmony_ci dst2 = LD_UB(dst + 2 * dst_stride); 825cabdff1aSopenharmony_ci dst5 = LD_UB(dst + 5 * dst_stride); 826cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst2, zero, dst5, res2, res5); 827cabdff1aSopenharmony_ci ADD2(res2, out2, res5, out5, res2, res5); 828cabdff1aSopenharmony_ci CLIP_SH2_0_255(res2, res5); 829cabdff1aSopenharmony_ci PCKEV_B2_SH(res2, res2, res5, res5, res2, res5); 830cabdff1aSopenharmony_ci ST_D1(res2, 0, dst + 2 * dst_stride); 831cabdff1aSopenharmony_ci ST_D1(res5, 0, dst + 5 * dst_stride); 832cabdff1aSopenharmony_ci} 833cabdff1aSopenharmony_ci 834cabdff1aSopenharmony_cistatic void 
vp9_iadst_idct_8x8_add_msa(int16_t *input, uint8_t *dst, 835cabdff1aSopenharmony_ci int32_t dst_stride, int32_t eob) 836cabdff1aSopenharmony_ci{ /* vp9_iadst_idct_8x8_add_msa: first pass VP9_IADST8x8_1D, transpose, second pass VP9_IDCT8x8_1D, then round by 5 bits and add to dst in two 4-row halves. LD_SH8 assigns the eight loaded vectors to in1, in6, in3, in4, in5, in2, in7, in0 (in that order) rather than in0..in7 before they are passed to VP9_IADST8x8_1D. The coefficient buffer is zeroed after the load. The eob parameter is not referenced in the body. */ 837cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 838cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 839cabdff1aSopenharmony_ci 840cabdff1aSopenharmony_ci /* load vector elements of 8x8 block */ 841cabdff1aSopenharmony_ci LD_SH8(input, 8, in1, in6, in3, in4, in5, in2, in7, in0); 842cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8); 843cabdff1aSopenharmony_ci /* 1D iadst8x8 */ 844cabdff1aSopenharmony_ci VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, 845cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 846cabdff1aSopenharmony_ci /* columns transform */ 847cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 848cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 849cabdff1aSopenharmony_ci /* 1D idct8x8 */ 850cabdff1aSopenharmony_ci VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, 851cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 852cabdff1aSopenharmony_ci /* final rounding (add 2^4, divide by 2^5) and shift */ 853cabdff1aSopenharmony_ci SRARI_H4_SH(in0, in1, in2, in3, 5); 854cabdff1aSopenharmony_ci SRARI_H4_SH(in4, in5, in6, in7, 5); 855cabdff1aSopenharmony_ci /* add block and store 8x8 */ 856cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); 857cabdff1aSopenharmony_ci dst += (4 * dst_stride); 858cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); 859cabdff1aSopenharmony_ci} 860cabdff1aSopenharmony_ci /* vp9_idct_iadst_8x8_add_msa: first pass VP9_IDCT8x8_1D; the transpose writes its outputs to in1, in6, in3, in4, in5, in2, in7, in0 (in that order); second pass VP9_IADST8x8_1D; then round by 5 bits and add to dst in two 4-row halves. The coefficient buffer is zeroed after the load. The eob parameter is not referenced in the body. */ 861cabdff1aSopenharmony_cistatic void vp9_idct_iadst_8x8_add_msa(int16_t *input, uint8_t *dst, 862cabdff1aSopenharmony_ci int32_t dst_stride, int32_t eob) 863cabdff1aSopenharmony_ci{ 864cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 865cabdff1aSopenharmony_ci v8i16 zero = { 0 };
866cabdff1aSopenharmony_ci 867cabdff1aSopenharmony_ci /* load vector elements of 8x8 block */ 868cabdff1aSopenharmony_ci LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); 869cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8); 870cabdff1aSopenharmony_ci 871cabdff1aSopenharmony_ci /* 1D idct8x8 */ 872cabdff1aSopenharmony_ci VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, 873cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 874cabdff1aSopenharmony_ci /* columns transform; outputs land in the permuted order in1, in6, in3, in4, in5, in2, in7, in0 */ 875cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 876cabdff1aSopenharmony_ci in1, in6, in3, in4, in5, in2, in7, in0); 877cabdff1aSopenharmony_ci /* 1D iadst8x8 */ 878cabdff1aSopenharmony_ci VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, 879cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 880cabdff1aSopenharmony_ci /* final rounding (add 2^4, divide by 2^5) and shift */ 881cabdff1aSopenharmony_ci SRARI_H4_SH(in0, in1, in2, in3, 5); 882cabdff1aSopenharmony_ci SRARI_H4_SH(in4, in5, in6, in7, 5); 883cabdff1aSopenharmony_ci /* add block and store 8x8 */ 884cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); 885cabdff1aSopenharmony_ci dst += (4 * dst_stride); 886cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); 887cabdff1aSopenharmony_ci} 888cabdff1aSopenharmony_ci /* VP9_IADST8x16_1D: one 1-D pass (per its name, the 16-point inverse ADST) over sixteen v8i16 vectors r0..r15, producing out0..out15. Stage 1 feeds input pairs through VP9_MADD_BF with cospi constant pairs into g0_m..g15_m. Stage 2 uses VP9_MADD_BF plus BUTTERFLY_4 / BUTTERFLY_8 to produce out8..out11 and the h*_m temporaries. Stage 3 produces out0, out1, out4..out7 and out12..out15. Stage 4 applies VP9_MADD_SHORT with +/-cospi_16_64 pairs to produce out2/out3 and to rewrite out6/out7, out10/out11 and out14/out15 in place. */ 889cabdff1aSopenharmony_ci#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \ 890cabdff1aSopenharmony_ci r9, r10, r11, r12, r13, r14, r15, \ 891cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, \ 892cabdff1aSopenharmony_ci out6, out7, out8, out9, out10, out11, \ 893cabdff1aSopenharmony_ci out12, out13, out14, out15) \ 894cabdff1aSopenharmony_ci{ \ 895cabdff1aSopenharmony_ci v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ 896cabdff1aSopenharmony_ci v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
897cabdff1aSopenharmony_ci v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ 898cabdff1aSopenharmony_ci v8i16 h8_m, h9_m, h10_m, h11_m; \ 899cabdff1aSopenharmony_ci v8i16 k0_m, k1_m, k2_m, k3_m; \ 900cabdff1aSopenharmony_ci \ 901cabdff1aSopenharmony_ci /* stage 1 */ \ 902cabdff1aSopenharmony_ci k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ 903cabdff1aSopenharmony_ci k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ 904cabdff1aSopenharmony_ci k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ 905cabdff1aSopenharmony_ci k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ 906cabdff1aSopenharmony_ci VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \ 907cabdff1aSopenharmony_ci g0_m, g1_m, g2_m, g3_m); \ 908cabdff1aSopenharmony_ci k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ 909cabdff1aSopenharmony_ci k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ 910cabdff1aSopenharmony_ci k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ 911cabdff1aSopenharmony_ci k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ 912cabdff1aSopenharmony_ci VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \ 913cabdff1aSopenharmony_ci g4_m, g5_m, g6_m, g7_m); \ 914cabdff1aSopenharmony_ci k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ 915cabdff1aSopenharmony_ci k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ 916cabdff1aSopenharmony_ci k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ 917cabdff1aSopenharmony_ci k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ 918cabdff1aSopenharmony_ci VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \ 919cabdff1aSopenharmony_ci g8_m, g9_m, g10_m, g11_m); \ 920cabdff1aSopenharmony_ci k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ 921cabdff1aSopenharmony_ci k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ 922cabdff1aSopenharmony_ci k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ 923cabdff1aSopenharmony_ci k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ 
924cabdff1aSopenharmony_ci VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \ 925cabdff1aSopenharmony_ci g12_m, g13_m, g14_m, g15_m); \ 926cabdff1aSopenharmony_ci \ 927cabdff1aSopenharmony_ci /* stage 2 */ \ 928cabdff1aSopenharmony_ci k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ 929cabdff1aSopenharmony_ci k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ 930cabdff1aSopenharmony_ci k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ 931cabdff1aSopenharmony_ci VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \ 932cabdff1aSopenharmony_ci h0_m, h1_m, h2_m, h3_m); \ 933cabdff1aSopenharmony_ci k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ 934cabdff1aSopenharmony_ci k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ 935cabdff1aSopenharmony_ci k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ 936cabdff1aSopenharmony_ci VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \ 937cabdff1aSopenharmony_ci h4_m, h5_m, h6_m, h7_m); \ 938cabdff1aSopenharmony_ci BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ 939cabdff1aSopenharmony_ci BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \ 940cabdff1aSopenharmony_ci h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ 941cabdff1aSopenharmony_ci \ 942cabdff1aSopenharmony_ci /* stage 3 */ \ 943cabdff1aSopenharmony_ci BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ 944cabdff1aSopenharmony_ci k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ 945cabdff1aSopenharmony_ci k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ 946cabdff1aSopenharmony_ci k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ 947cabdff1aSopenharmony_ci VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \ 948cabdff1aSopenharmony_ci out4, out6, out5, out7); \ 949cabdff1aSopenharmony_ci VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \ 950cabdff1aSopenharmony_ci out12, out14, out13, out15); \ 951cabdff1aSopenharmony_ci \ 
952cabdff1aSopenharmony_ci /* stage 4 */ \ 953cabdff1aSopenharmony_ci k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ 954cabdff1aSopenharmony_ci k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ 955cabdff1aSopenharmony_ci k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ 956cabdff1aSopenharmony_ci k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ 957cabdff1aSopenharmony_ci VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ 958cabdff1aSopenharmony_ci VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ 959cabdff1aSopenharmony_ci VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ 960cabdff1aSopenharmony_ci VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ 961cabdff1aSopenharmony_ci} 962cabdff1aSopenharmony_ci /* vp9_idct16_1d_columns_addblk_msa: loads sixteen v8i16 vectors from input with a stride of 16 int16_t elements, zeroes those same 16 locations in the coefficient buffer (advancing the local input pointer by 8 * 16 between the two ST_SH8 calls), runs the 16-point inverse DCT butterfly network on them using VP9_DOTP_CONST_PAIR and BUTTERFLY_4, rounds the results by 6 bits (SRARI_H4_SH) and adds them to 16 rows of dst through four VP9_ADDBLK_ST8x4_UB calls, advancing dst by 4 * dst_stride between calls. */ 963cabdff1aSopenharmony_cistatic void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, 964cabdff1aSopenharmony_ci int32_t dst_stride) 965cabdff1aSopenharmony_ci{ 966cabdff1aSopenharmony_ci v8i16 loc0, loc1, loc2, loc3; 967cabdff1aSopenharmony_ci v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; 968cabdff1aSopenharmony_ci v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; 969cabdff1aSopenharmony_ci v8i16 tmp5, tmp6, tmp7; 970cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 971cabdff1aSopenharmony_ci 972cabdff1aSopenharmony_ci /* load up 8x16 */ 973cabdff1aSopenharmony_ci LD_SH16(input, 16, 974cabdff1aSopenharmony_ci reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, 975cabdff1aSopenharmony_ci reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); 976cabdff1aSopenharmony_ci /* clear the first eight loaded rows */ 977cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16); 978cabdff1aSopenharmony_ci input += 8 * 16; 979cabdff1aSopenharmony_ci /* clear the remaining eight loaded rows */ ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16); 980cabdff1aSopenharmony_ci 981cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); 982cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64,
cospi_20_64, reg10, reg6); 983cabdff1aSopenharmony_ci BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); 984cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); 985cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); 986cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); 987cabdff1aSopenharmony_ci BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); 988cabdff1aSopenharmony_ci 989cabdff1aSopenharmony_ci reg0 = reg2 - loc1; 990cabdff1aSopenharmony_ci reg2 = reg2 + loc1; 991cabdff1aSopenharmony_ci reg12 = reg14 - loc0; 992cabdff1aSopenharmony_ci reg14 = reg14 + loc0; 993cabdff1aSopenharmony_ci reg4 = reg6 - loc3; 994cabdff1aSopenharmony_ci reg6 = reg6 + loc3; 995cabdff1aSopenharmony_ci reg8 = reg10 - loc2; 996cabdff1aSopenharmony_ci reg10 = reg10 + loc2; 997cabdff1aSopenharmony_ci 998cabdff1aSopenharmony_ci /* stage 2 */ 999cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); 1000cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); 1001cabdff1aSopenharmony_ci 1002cabdff1aSopenharmony_ci reg9 = reg1 - loc2; 1003cabdff1aSopenharmony_ci reg1 = reg1 + loc2; 1004cabdff1aSopenharmony_ci reg7 = reg15 - loc3; 1005cabdff1aSopenharmony_ci reg15 = reg15 + loc3; 1006cabdff1aSopenharmony_ci 1007cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); 1008cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); 1009cabdff1aSopenharmony_ci BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); 1010cabdff1aSopenharmony_ci 1011cabdff1aSopenharmony_ci loc1 = reg15 + reg3; 1012cabdff1aSopenharmony_ci reg3 = reg15 - reg3; 1013cabdff1aSopenharmony_ci loc2 = reg2 + loc1; 1014cabdff1aSopenharmony_ci reg15 = reg2 - loc1; 1015cabdff1aSopenharmony_ci 
1016cabdff1aSopenharmony_ci loc1 = reg1 + reg13; 1017cabdff1aSopenharmony_ci reg13 = reg1 - reg13; 1018cabdff1aSopenharmony_ci loc0 = reg0 + loc1; 1019cabdff1aSopenharmony_ci loc1 = reg0 - loc1; 1020cabdff1aSopenharmony_ci tmp6 = loc0; 1021cabdff1aSopenharmony_ci tmp7 = loc1; 1022cabdff1aSopenharmony_ci reg0 = loc2; 1023cabdff1aSopenharmony_ci 1024cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); 1025cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, 1026cabdff1aSopenharmony_ci reg11); 1027cabdff1aSopenharmony_ci 1028cabdff1aSopenharmony_ci loc0 = reg9 + reg5; 1029cabdff1aSopenharmony_ci reg5 = reg9 - reg5; 1030cabdff1aSopenharmony_ci reg2 = reg6 + loc0; 1031cabdff1aSopenharmony_ci reg1 = reg6 - loc0; 1032cabdff1aSopenharmony_ci 1033cabdff1aSopenharmony_ci loc0 = reg7 + reg11; 1034cabdff1aSopenharmony_ci reg11 = reg7 - reg11; 1035cabdff1aSopenharmony_ci loc1 = reg4 + loc0; 1036cabdff1aSopenharmony_ci loc2 = reg4 - loc0; 1037cabdff1aSopenharmony_ci tmp5 = loc1; 1038cabdff1aSopenharmony_ci 1039cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); 1040cabdff1aSopenharmony_ci BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); 1041cabdff1aSopenharmony_ci 1042cabdff1aSopenharmony_ci reg10 = loc0; 1043cabdff1aSopenharmony_ci reg11 = loc1; 1044cabdff1aSopenharmony_ci 1045cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); 1046cabdff1aSopenharmony_ci BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); 1047cabdff1aSopenharmony_ci reg13 = loc2; 1048cabdff1aSopenharmony_ci 1049cabdff1aSopenharmony_ci /* Transpose and store the output */ 1050cabdff1aSopenharmony_ci reg12 = tmp5; 1051cabdff1aSopenharmony_ci reg14 = tmp6; 1052cabdff1aSopenharmony_ci reg3 = tmp7; 1053cabdff1aSopenharmony_ci 1054cabdff1aSopenharmony_ci SRARI_H4_SH(reg0, reg2, reg4, reg6, 6); 1055cabdff1aSopenharmony_ci 
VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6); 1056cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1057cabdff1aSopenharmony_ci SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); 1058cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); 1059cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1060cabdff1aSopenharmony_ci SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); 1061cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); 1062cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1063cabdff1aSopenharmony_ci SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); 1064cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); 1065cabdff1aSopenharmony_ci} 1066cabdff1aSopenharmony_ci 1067cabdff1aSopenharmony_cistatic void vp9_idct16_1d_columns_msa(int16_t *input, int16_t *output) 1068cabdff1aSopenharmony_ci{ 1069cabdff1aSopenharmony_ci v8i16 loc0, loc1, loc2, loc3; 1070cabdff1aSopenharmony_ci v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; 1071cabdff1aSopenharmony_ci v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; 1072cabdff1aSopenharmony_ci v8i16 tmp5, tmp6, tmp7; 1073cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 1074cabdff1aSopenharmony_ci 1075cabdff1aSopenharmony_ci /* load up 8x16 */ 1076cabdff1aSopenharmony_ci LD_SH16(input, 16, 1077cabdff1aSopenharmony_ci reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, 1078cabdff1aSopenharmony_ci reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); 1079cabdff1aSopenharmony_ci 1080cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16); 1081cabdff1aSopenharmony_ci input += 16 * 8; 1082cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16); 1083cabdff1aSopenharmony_ci 1084cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); 1085cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); 
1086cabdff1aSopenharmony_ci BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); 1087cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); 1088cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); 1089cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); 1090cabdff1aSopenharmony_ci BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); 1091cabdff1aSopenharmony_ci 1092cabdff1aSopenharmony_ci reg0 = reg2 - loc1; 1093cabdff1aSopenharmony_ci reg2 = reg2 + loc1; 1094cabdff1aSopenharmony_ci reg12 = reg14 - loc0; 1095cabdff1aSopenharmony_ci reg14 = reg14 + loc0; 1096cabdff1aSopenharmony_ci reg4 = reg6 - loc3; 1097cabdff1aSopenharmony_ci reg6 = reg6 + loc3; 1098cabdff1aSopenharmony_ci reg8 = reg10 - loc2; 1099cabdff1aSopenharmony_ci reg10 = reg10 + loc2; 1100cabdff1aSopenharmony_ci 1101cabdff1aSopenharmony_ci /* stage 2 */ 1102cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); 1103cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); 1104cabdff1aSopenharmony_ci 1105cabdff1aSopenharmony_ci reg9 = reg1 - loc2; 1106cabdff1aSopenharmony_ci reg1 = reg1 + loc2; 1107cabdff1aSopenharmony_ci reg7 = reg15 - loc3; 1108cabdff1aSopenharmony_ci reg15 = reg15 + loc3; 1109cabdff1aSopenharmony_ci 1110cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); 1111cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); 1112cabdff1aSopenharmony_ci BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); 1113cabdff1aSopenharmony_ci 1114cabdff1aSopenharmony_ci loc1 = reg15 + reg3; 1115cabdff1aSopenharmony_ci reg3 = reg15 - reg3; 1116cabdff1aSopenharmony_ci loc2 = reg2 + loc1; 1117cabdff1aSopenharmony_ci reg15 = reg2 - loc1; 1118cabdff1aSopenharmony_ci 1119cabdff1aSopenharmony_ci loc1 = 
reg1 + reg13; 1120cabdff1aSopenharmony_ci reg13 = reg1 - reg13; 1121cabdff1aSopenharmony_ci loc0 = reg0 + loc1; 1122cabdff1aSopenharmony_ci loc1 = reg0 - loc1; 1123cabdff1aSopenharmony_ci tmp6 = loc0; 1124cabdff1aSopenharmony_ci tmp7 = loc1; 1125cabdff1aSopenharmony_ci reg0 = loc2; 1126cabdff1aSopenharmony_ci 1127cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); 1128cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, 1129cabdff1aSopenharmony_ci reg11); 1130cabdff1aSopenharmony_ci 1131cabdff1aSopenharmony_ci loc0 = reg9 + reg5; 1132cabdff1aSopenharmony_ci reg5 = reg9 - reg5; 1133cabdff1aSopenharmony_ci reg2 = reg6 + loc0; 1134cabdff1aSopenharmony_ci reg1 = reg6 - loc0; 1135cabdff1aSopenharmony_ci 1136cabdff1aSopenharmony_ci loc0 = reg7 + reg11; 1137cabdff1aSopenharmony_ci reg11 = reg7 - reg11; 1138cabdff1aSopenharmony_ci loc1 = reg4 + loc0; 1139cabdff1aSopenharmony_ci loc2 = reg4 - loc0; 1140cabdff1aSopenharmony_ci 1141cabdff1aSopenharmony_ci tmp5 = loc1; 1142cabdff1aSopenharmony_ci 1143cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); 1144cabdff1aSopenharmony_ci BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); 1145cabdff1aSopenharmony_ci 1146cabdff1aSopenharmony_ci reg10 = loc0; 1147cabdff1aSopenharmony_ci reg11 = loc1; 1148cabdff1aSopenharmony_ci 1149cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); 1150cabdff1aSopenharmony_ci BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); 1151cabdff1aSopenharmony_ci reg13 = loc2; 1152cabdff1aSopenharmony_ci 1153cabdff1aSopenharmony_ci /* Transpose and store the output */ 1154cabdff1aSopenharmony_ci reg12 = tmp5; 1155cabdff1aSopenharmony_ci reg14 = tmp6; 1156cabdff1aSopenharmony_ci reg3 = tmp7; 1157cabdff1aSopenharmony_ci 1158cabdff1aSopenharmony_ci /* transpose block */ 1159cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(reg0, 
reg2, reg4, reg6, reg8, reg10, reg12, reg14, 1160cabdff1aSopenharmony_ci reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); 1161cabdff1aSopenharmony_ci ST_SH4(reg0, reg2, reg4, reg6, output, 16); 1162cabdff1aSopenharmony_ci ST_SH4(reg8, reg10, reg12, reg14, (output + 4 * 16), 16); 1163cabdff1aSopenharmony_ci 1164cabdff1aSopenharmony_ci /* transpose block */ 1165cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, 1166cabdff1aSopenharmony_ci reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); 1167cabdff1aSopenharmony_ci ST_SH4(reg3, reg13, reg11, reg5, (output + 8), 16); 1168cabdff1aSopenharmony_ci ST_SH4(reg7, reg9, reg1, reg15, (output + 8 + 4 * 16), 16); 1169cabdff1aSopenharmony_ci} 1170cabdff1aSopenharmony_ci 1171cabdff1aSopenharmony_cistatic void vp9_idct16x16_1_add_msa(int16_t *input, uint8_t *dst, 1172cabdff1aSopenharmony_ci int32_t dst_stride) 1173cabdff1aSopenharmony_ci{ 1174cabdff1aSopenharmony_ci uint8_t i; 1175cabdff1aSopenharmony_ci int16_t out; 1176cabdff1aSopenharmony_ci v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; 1177cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; 1178cabdff1aSopenharmony_ci 1179cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); 1180cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); 1181cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO(out, 6); 1182cabdff1aSopenharmony_ci input[0] = 0; 1183cabdff1aSopenharmony_ci 1184cabdff1aSopenharmony_ci vec = __msa_fill_h(out); 1185cabdff1aSopenharmony_ci 1186cabdff1aSopenharmony_ci for (i = 4; i--;) { 1187cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 1188cabdff1aSopenharmony_ci UNPCK_UB_SH(dst0, res0, res4); 1189cabdff1aSopenharmony_ci UNPCK_UB_SH(dst1, res1, res5); 1190cabdff1aSopenharmony_ci UNPCK_UB_SH(dst2, res2, res6); 1191cabdff1aSopenharmony_ci UNPCK_UB_SH(dst3, res3, res7); 
1192cabdff1aSopenharmony_ci ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, 1193cabdff1aSopenharmony_ci res3); 1194cabdff1aSopenharmony_ci ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, 1195cabdff1aSopenharmony_ci res7); 1196cabdff1aSopenharmony_ci CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7); 1197cabdff1aSopenharmony_ci PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, 1198cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1199cabdff1aSopenharmony_ci ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 1200cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1201cabdff1aSopenharmony_ci } 1202cabdff1aSopenharmony_ci} 1203cabdff1aSopenharmony_ci 1204cabdff1aSopenharmony_cistatic void vp9_idct16x16_10_colcol_addblk_msa(int16_t *input, uint8_t *dst, 1205cabdff1aSopenharmony_ci int32_t dst_stride) 1206cabdff1aSopenharmony_ci{ 1207cabdff1aSopenharmony_ci int32_t i; 1208cabdff1aSopenharmony_ci int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT); 1209cabdff1aSopenharmony_ci int16_t *out = out_arr; 1210cabdff1aSopenharmony_ci 1211cabdff1aSopenharmony_ci /* transform rows */ 1212cabdff1aSopenharmony_ci vp9_idct16_1d_columns_msa(input, out); 1213cabdff1aSopenharmony_ci 1214cabdff1aSopenharmony_ci /* short case just considers top 4 rows as valid output */ 1215cabdff1aSopenharmony_ci out += 4 * 16; 1216cabdff1aSopenharmony_ci for (i = 12; i--;) { 1217cabdff1aSopenharmony_ci __asm__ volatile ( 1218cabdff1aSopenharmony_ci "sw $zero, 0(%[out]) \n\t" 1219cabdff1aSopenharmony_ci "sw $zero, 4(%[out]) \n\t" 1220cabdff1aSopenharmony_ci "sw $zero, 8(%[out]) \n\t" 1221cabdff1aSopenharmony_ci "sw $zero, 12(%[out]) \n\t" 1222cabdff1aSopenharmony_ci "sw $zero, 16(%[out]) \n\t" 1223cabdff1aSopenharmony_ci "sw $zero, 20(%[out]) \n\t" 1224cabdff1aSopenharmony_ci "sw $zero, 24(%[out]) \n\t" 1225cabdff1aSopenharmony_ci "sw $zero, 28(%[out]) \n\t" 1226cabdff1aSopenharmony_ci 1227cabdff1aSopenharmony_ci : 1228cabdff1aSopenharmony_ci : [out] 
"r" (out) 1229cabdff1aSopenharmony_ci ); 1230cabdff1aSopenharmony_ci 1231cabdff1aSopenharmony_ci out += 16; 1232cabdff1aSopenharmony_ci } 1233cabdff1aSopenharmony_ci 1234cabdff1aSopenharmony_ci out = out_arr; 1235cabdff1aSopenharmony_ci 1236cabdff1aSopenharmony_ci /* transform columns */ 1237cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 1238cabdff1aSopenharmony_ci /* process 8 * 16 block */ 1239cabdff1aSopenharmony_ci vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), 1240cabdff1aSopenharmony_ci dst_stride); 1241cabdff1aSopenharmony_ci } 1242cabdff1aSopenharmony_ci} 1243cabdff1aSopenharmony_ci 1244cabdff1aSopenharmony_cistatic void vp9_idct16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst, 1245cabdff1aSopenharmony_ci int32_t dst_stride) 1246cabdff1aSopenharmony_ci{ 1247cabdff1aSopenharmony_ci int32_t i; 1248cabdff1aSopenharmony_ci int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT); 1249cabdff1aSopenharmony_ci int16_t *out = out_arr; 1250cabdff1aSopenharmony_ci 1251cabdff1aSopenharmony_ci /* transform rows */ 1252cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 1253cabdff1aSopenharmony_ci /* process 8 * 16 block */ 1254cabdff1aSopenharmony_ci vp9_idct16_1d_columns_msa((input + (i << 3)), (out + (i << 7))); 1255cabdff1aSopenharmony_ci } 1256cabdff1aSopenharmony_ci 1257cabdff1aSopenharmony_ci /* transform columns */ 1258cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 1259cabdff1aSopenharmony_ci /* process 8 * 16 block */ 1260cabdff1aSopenharmony_ci vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), 1261cabdff1aSopenharmony_ci dst_stride); 1262cabdff1aSopenharmony_ci } 1263cabdff1aSopenharmony_ci} 1264cabdff1aSopenharmony_ci 1265cabdff1aSopenharmony_cistatic void vp9_iadst16_1d_columns_msa(int16_t *input, int16_t *output) 1266cabdff1aSopenharmony_ci{ 1267cabdff1aSopenharmony_ci v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; 1268cabdff1aSopenharmony_ci v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, 
l10, l11, l12, l13, l14, l15; 1269cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 1270cabdff1aSopenharmony_ci 1271cabdff1aSopenharmony_ci /* load input data */ 1272cabdff1aSopenharmony_ci LD_SH16(input, 16, 1273cabdff1aSopenharmony_ci l0, l1, l2, l3, l4, l5, l6, l7, 1274cabdff1aSopenharmony_ci l8, l9, l10, l11, l12, l13, l14, l15); 1275cabdff1aSopenharmony_ci 1276cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16); 1277cabdff1aSopenharmony_ci input += 16 * 8; 1278cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16); 1279cabdff1aSopenharmony_ci 1280cabdff1aSopenharmony_ci /* ADST in horizontal */ 1281cabdff1aSopenharmony_ci VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, 1282cabdff1aSopenharmony_ci l8, l9, l10, l11, l12, l13, l14, l15, 1283cabdff1aSopenharmony_ci r0, r1, r2, r3, r4, r5, r6, r7, 1284cabdff1aSopenharmony_ci r8, r9, r10, r11, r12, r13, r14, r15); 1285cabdff1aSopenharmony_ci 1286cabdff1aSopenharmony_ci l1 = -r8; 1287cabdff1aSopenharmony_ci l3 = -r4; 1288cabdff1aSopenharmony_ci l13 = -r13; 1289cabdff1aSopenharmony_ci l15 = -r1; 1290cabdff1aSopenharmony_ci 1291cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, 1292cabdff1aSopenharmony_ci l0, l1, l2, l3, l4, l5, l6, l7); 1293cabdff1aSopenharmony_ci ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); 1294cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, 1295cabdff1aSopenharmony_ci l8, l9, l10, l11, l12, l13, l14, l15); 1296cabdff1aSopenharmony_ci ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); 1297cabdff1aSopenharmony_ci} 1298cabdff1aSopenharmony_ci 1299cabdff1aSopenharmony_cistatic void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, 1300cabdff1aSopenharmony_ci int32_t dst_stride) 1301cabdff1aSopenharmony_ci{ 1302cabdff1aSopenharmony_ci v8i16 v0, v2, v4, v6, k0, k1, k2, k3; 1303cabdff1aSopenharmony_ci v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 
r10, r11, r12, r13, r14, r15; 1304cabdff1aSopenharmony_ci v8i16 out0, out1, out2, out3, out4, out5, out6, out7; 1305cabdff1aSopenharmony_ci v8i16 out8, out9, out10, out11, out12, out13, out14, out15; 1306cabdff1aSopenharmony_ci v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; 1307cabdff1aSopenharmony_ci v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; 1308cabdff1aSopenharmony_ci v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 1309cabdff1aSopenharmony_ci v8i16 res8, res9, res10, res11, res12, res13, res14, res15; 1310cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1311cabdff1aSopenharmony_ci v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; 1312cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 1313cabdff1aSopenharmony_ci 1314cabdff1aSopenharmony_ci r0 = LD_SH(input + 0 * 16); 1315cabdff1aSopenharmony_ci r3 = LD_SH(input + 3 * 16); 1316cabdff1aSopenharmony_ci r4 = LD_SH(input + 4 * 16); 1317cabdff1aSopenharmony_ci r7 = LD_SH(input + 7 * 16); 1318cabdff1aSopenharmony_ci r8 = LD_SH(input + 8 * 16); 1319cabdff1aSopenharmony_ci r11 = LD_SH(input + 11 * 16); 1320cabdff1aSopenharmony_ci r12 = LD_SH(input + 12 * 16); 1321cabdff1aSopenharmony_ci r15 = LD_SH(input + 15 * 16); 1322cabdff1aSopenharmony_ci 1323cabdff1aSopenharmony_ci /* stage 1 */ 1324cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); 1325cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); 1326cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); 1327cabdff1aSopenharmony_ci k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); 1328cabdff1aSopenharmony_ci VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); 1329cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); 1330cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); 1331cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); 1332cabdff1aSopenharmony_ci k3 
= VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); 1333cabdff1aSopenharmony_ci VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); 1334cabdff1aSopenharmony_ci BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0); 1335cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); 1336cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); 1337cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); 1338cabdff1aSopenharmony_ci VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); 1339cabdff1aSopenharmony_ci 1340cabdff1aSopenharmony_ci r1 = LD_SH(input + 1 * 16); 1341cabdff1aSopenharmony_ci r2 = LD_SH(input + 2 * 16); 1342cabdff1aSopenharmony_ci r5 = LD_SH(input + 5 * 16); 1343cabdff1aSopenharmony_ci r6 = LD_SH(input + 6 * 16); 1344cabdff1aSopenharmony_ci r9 = LD_SH(input + 9 * 16); 1345cabdff1aSopenharmony_ci r10 = LD_SH(input + 10 * 16); 1346cabdff1aSopenharmony_ci r13 = LD_SH(input + 13 * 16); 1347cabdff1aSopenharmony_ci r14 = LD_SH(input + 14 * 16); 1348cabdff1aSopenharmony_ci 1349cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); 1350cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); 1351cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); 1352cabdff1aSopenharmony_ci k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); 1353cabdff1aSopenharmony_ci VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7); 1354cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); 1355cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); 1356cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); 1357cabdff1aSopenharmony_ci k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); 1358cabdff1aSopenharmony_ci VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15); 1359cabdff1aSopenharmony_ci BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4); 1360cabdff1aSopenharmony_ci BUTTERFLY_4(h8, h9, h11, 
h10, out0, out1, h11, h10); 1361cabdff1aSopenharmony_ci out1 = -out1; 1362cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, 6); 1363cabdff1aSopenharmony_ci dst0 = LD_UB(dst + 0 * dst_stride); 1364cabdff1aSopenharmony_ci dst1 = LD_UB(dst + 15 * dst_stride); 1365cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1); 1366cabdff1aSopenharmony_ci ADD2(res0, out0, res1, out1, res0, res1); 1367cabdff1aSopenharmony_ci CLIP_SH2_0_255(res0, res1); 1368cabdff1aSopenharmony_ci PCKEV_B2_SH(res0, res0, res1, res1, res0, res1); 1369cabdff1aSopenharmony_ci ST_D1(res0, 0, dst); 1370cabdff1aSopenharmony_ci ST_D1(res1, 0, dst + 15 * dst_stride); 1371cabdff1aSopenharmony_ci 1372cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); 1373cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); 1374cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); 1375cabdff1aSopenharmony_ci VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); 1376cabdff1aSopenharmony_ci BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); 1377cabdff1aSopenharmony_ci out8 = -out8; 1378cabdff1aSopenharmony_ci 1379cabdff1aSopenharmony_ci SRARI_H2_SH(out8, out9, 6); 1380cabdff1aSopenharmony_ci dst8 = LD_UB(dst + 1 * dst_stride); 1381cabdff1aSopenharmony_ci dst9 = LD_UB(dst + 14 * dst_stride); 1382cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9); 1383cabdff1aSopenharmony_ci ADD2(res8, out8, res9, out9, res8, res9); 1384cabdff1aSopenharmony_ci CLIP_SH2_0_255(res8, res9); 1385cabdff1aSopenharmony_ci PCKEV_B2_SH(res8, res8, res9, res9, res8, res9); 1386cabdff1aSopenharmony_ci ST_D1(res8, 0, dst + dst_stride); 1387cabdff1aSopenharmony_ci ST_D1(res9, 0, dst + 14 * dst_stride); 1388cabdff1aSopenharmony_ci 1389cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); 1390cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); 1391cabdff1aSopenharmony_ci k2 = 
VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); 1392cabdff1aSopenharmony_ci VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7); 1393cabdff1aSopenharmony_ci out4 = -out4; 1394cabdff1aSopenharmony_ci SRARI_H2_SH(out4, out5, 6); 1395cabdff1aSopenharmony_ci dst4 = LD_UB(dst + 3 * dst_stride); 1396cabdff1aSopenharmony_ci dst5 = LD_UB(dst + 12 * dst_stride); 1397cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5); 1398cabdff1aSopenharmony_ci ADD2(res4, out4, res5, out5, res4, res5); 1399cabdff1aSopenharmony_ci CLIP_SH2_0_255(res4, res5); 1400cabdff1aSopenharmony_ci PCKEV_B2_SH(res4, res4, res5, res5, res4, res5); 1401cabdff1aSopenharmony_ci ST_D1(res4, 0, dst + 3 * dst_stride); 1402cabdff1aSopenharmony_ci ST_D1(res5, 0, dst + 12 * dst_stride); 1403cabdff1aSopenharmony_ci 1404cabdff1aSopenharmony_ci VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); 1405cabdff1aSopenharmony_ci out13 = -out13; 1406cabdff1aSopenharmony_ci SRARI_H2_SH(out12, out13, 6); 1407cabdff1aSopenharmony_ci dst12 = LD_UB(dst + 2 * dst_stride); 1408cabdff1aSopenharmony_ci dst13 = LD_UB(dst + 13 * dst_stride); 1409cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13); 1410cabdff1aSopenharmony_ci ADD2(res12, out12, res13, out13, res12, res13); 1411cabdff1aSopenharmony_ci CLIP_SH2_0_255(res12, res13); 1412cabdff1aSopenharmony_ci PCKEV_B2_SH(res12, res12, res13, res13, res12, res13); 1413cabdff1aSopenharmony_ci ST_D1(res12, 0, dst + 2 * dst_stride); 1414cabdff1aSopenharmony_ci ST_D1(res13, 0, dst + 13 * dst_stride); 1415cabdff1aSopenharmony_ci 1416cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); 1417cabdff1aSopenharmony_ci k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); 1418cabdff1aSopenharmony_ci VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7); 1419cabdff1aSopenharmony_ci SRARI_H2_SH(out6, out7, 6); 1420cabdff1aSopenharmony_ci dst6 = LD_UB(dst + 4 * dst_stride); 1421cabdff1aSopenharmony_ci dst7 = 
LD_UB(dst + 11 * dst_stride); 1422cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7); 1423cabdff1aSopenharmony_ci ADD2(res6, out6, res7, out7, res6, res7); 1424cabdff1aSopenharmony_ci CLIP_SH2_0_255(res6, res7); 1425cabdff1aSopenharmony_ci PCKEV_B2_SH(res6, res6, res7, res7, res6, res7); 1426cabdff1aSopenharmony_ci ST_D1(res6, 0, dst + 4 * dst_stride); 1427cabdff1aSopenharmony_ci ST_D1(res7, 0, dst + 11 * dst_stride); 1428cabdff1aSopenharmony_ci 1429cabdff1aSopenharmony_ci VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11); 1430cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out11, 6); 1431cabdff1aSopenharmony_ci dst10 = LD_UB(dst + 6 * dst_stride); 1432cabdff1aSopenharmony_ci dst11 = LD_UB(dst + 9 * dst_stride); 1433cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11); 1434cabdff1aSopenharmony_ci ADD2(res10, out10, res11, out11, res10, res11); 1435cabdff1aSopenharmony_ci CLIP_SH2_0_255(res10, res11); 1436cabdff1aSopenharmony_ci PCKEV_B2_SH(res10, res10, res11, res11, res10, res11); 1437cabdff1aSopenharmony_ci ST_D1(res10, 0, dst + 6 * dst_stride); 1438cabdff1aSopenharmony_ci ST_D1(res11, 0, dst + 9 * dst_stride); 1439cabdff1aSopenharmony_ci 1440cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); 1441cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); 1442cabdff1aSopenharmony_ci VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3); 1443cabdff1aSopenharmony_ci SRARI_H2_SH(out2, out3, 6); 1444cabdff1aSopenharmony_ci dst2 = LD_UB(dst + 7 * dst_stride); 1445cabdff1aSopenharmony_ci dst3 = LD_UB(dst + 8 * dst_stride); 1446cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3); 1447cabdff1aSopenharmony_ci ADD2(res2, out2, res3, out3, res2, res3); 1448cabdff1aSopenharmony_ci CLIP_SH2_0_255(res2, res3); 1449cabdff1aSopenharmony_ci PCKEV_B2_SH(res2, res2, res3, res3, res2, res3); 1450cabdff1aSopenharmony_ci ST_D1(res2, 0, dst + 7 * dst_stride); 1451cabdff1aSopenharmony_ci 
ST_D1(res3, 0, dst + 8 * dst_stride); 1452cabdff1aSopenharmony_ci 1453cabdff1aSopenharmony_ci VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15); 1454cabdff1aSopenharmony_ci SRARI_H2_SH(out14, out15, 6); 1455cabdff1aSopenharmony_ci dst14 = LD_UB(dst + 5 * dst_stride); 1456cabdff1aSopenharmony_ci dst15 = LD_UB(dst + 10 * dst_stride); 1457cabdff1aSopenharmony_ci ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); 1458cabdff1aSopenharmony_ci ADD2(res14, out14, res15, out15, res14, res15); 1459cabdff1aSopenharmony_ci CLIP_SH2_0_255(res14, res15); 1460cabdff1aSopenharmony_ci PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); 1461cabdff1aSopenharmony_ci ST_D1(res14, 0, dst + 5 * dst_stride); 1462cabdff1aSopenharmony_ci ST_D1(res15, 0, dst + 10 * dst_stride); 1463cabdff1aSopenharmony_ci} 1464cabdff1aSopenharmony_ci 1465cabdff1aSopenharmony_cistatic void vp9_iadst16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst, 1466cabdff1aSopenharmony_ci int32_t dst_stride) 1467cabdff1aSopenharmony_ci{ 1468cabdff1aSopenharmony_ci int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT); 1469cabdff1aSopenharmony_ci int16_t *out = out_arr; 1470cabdff1aSopenharmony_ci int32_t i; 1471cabdff1aSopenharmony_ci 1472cabdff1aSopenharmony_ci /* transform rows */ 1473cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 1474cabdff1aSopenharmony_ci /* process 16 * 8 block */ 1475cabdff1aSopenharmony_ci vp9_iadst16_1d_columns_msa((input + (i << 3)), (out + (i << 7))); 1476cabdff1aSopenharmony_ci } 1477cabdff1aSopenharmony_ci 1478cabdff1aSopenharmony_ci /* transform columns */ 1479cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 1480cabdff1aSopenharmony_ci /* process 8 * 16 block */ 1481cabdff1aSopenharmony_ci vp9_iadst16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), 1482cabdff1aSopenharmony_ci dst_stride); 1483cabdff1aSopenharmony_ci } 1484cabdff1aSopenharmony_ci} 1485cabdff1aSopenharmony_ci 1486cabdff1aSopenharmony_cistatic void vp9_iadst_idct_16x16_add_msa(int16_t *input, uint8_t *dst, 
1487cabdff1aSopenharmony_ci int32_t dst_stride, int32_t eob) 1488cabdff1aSopenharmony_ci{ 1489cabdff1aSopenharmony_ci int32_t i; 1490cabdff1aSopenharmony_ci int16_t out[16 * 16]; 1491cabdff1aSopenharmony_ci int16_t *out_ptr = &out[0]; 1492cabdff1aSopenharmony_ci 1493cabdff1aSopenharmony_ci /* transform rows */ 1494cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 1495cabdff1aSopenharmony_ci /* process 8 * 16 block */ 1496cabdff1aSopenharmony_ci vp9_iadst16_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 7))); 1497cabdff1aSopenharmony_ci } 1498cabdff1aSopenharmony_ci 1499cabdff1aSopenharmony_ci /* transform columns */ 1500cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 1501cabdff1aSopenharmony_ci /* process 8 * 16 block */ 1502cabdff1aSopenharmony_ci vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), 1503cabdff1aSopenharmony_ci (dst + (i << 3)), dst_stride); 1504cabdff1aSopenharmony_ci } 1505cabdff1aSopenharmony_ci} 1506cabdff1aSopenharmony_ci 1507cabdff1aSopenharmony_cistatic void vp9_idct_iadst_16x16_add_msa(int16_t *input, uint8_t *dst, 1508cabdff1aSopenharmony_ci int32_t dst_stride, int32_t eob) 1509cabdff1aSopenharmony_ci{ 1510cabdff1aSopenharmony_ci int32_t i; 1511cabdff1aSopenharmony_ci int16_t out[16 * 16]; 1512cabdff1aSopenharmony_ci int16_t *out_ptr = &out[0]; 1513cabdff1aSopenharmony_ci 1514cabdff1aSopenharmony_ci /* transform rows */ 1515cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 1516cabdff1aSopenharmony_ci /* process 8 * 16 block */ 1517cabdff1aSopenharmony_ci vp9_idct16_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 7))); 1518cabdff1aSopenharmony_ci } 1519cabdff1aSopenharmony_ci 1520cabdff1aSopenharmony_ci /* transform columns */ 1521cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 1522cabdff1aSopenharmony_ci /* process 8 * 16 block */ 1523cabdff1aSopenharmony_ci vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)), 1524cabdff1aSopenharmony_ci (dst + (i << 3)), dst_stride); 1525cabdff1aSopenharmony_ci } 
1526cabdff1aSopenharmony_ci} 1527cabdff1aSopenharmony_ci 1528cabdff1aSopenharmony_cistatic void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf, 1529cabdff1aSopenharmony_ci int16_t *tmp_eve_buf, 1530cabdff1aSopenharmony_ci int16_t *tmp_odd_buf, 1531cabdff1aSopenharmony_ci int16_t *dst) 1532cabdff1aSopenharmony_ci{ 1533cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 1534cabdff1aSopenharmony_ci v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; 1535cabdff1aSopenharmony_ci 1536cabdff1aSopenharmony_ci /* FINAL BUTTERFLY : Dependency on Even & Odd */ 1537cabdff1aSopenharmony_ci vec0 = LD_SH(tmp_odd_buf); 1538cabdff1aSopenharmony_ci vec1 = LD_SH(tmp_odd_buf + 9 * 8); 1539cabdff1aSopenharmony_ci vec2 = LD_SH(tmp_odd_buf + 14 * 8); 1540cabdff1aSopenharmony_ci vec3 = LD_SH(tmp_odd_buf + 6 * 8); 1541cabdff1aSopenharmony_ci loc0 = LD_SH(tmp_eve_buf); 1542cabdff1aSopenharmony_ci loc1 = LD_SH(tmp_eve_buf + 8 * 8); 1543cabdff1aSopenharmony_ci loc2 = LD_SH(tmp_eve_buf + 4 * 8); 1544cabdff1aSopenharmony_ci loc3 = LD_SH(tmp_eve_buf + 12 * 8); 1545cabdff1aSopenharmony_ci 1546cabdff1aSopenharmony_ci ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); 1547cabdff1aSopenharmony_ci 1548cabdff1aSopenharmony_ci ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); 1549cabdff1aSopenharmony_ci ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); 1550cabdff1aSopenharmony_ci ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); 1551cabdff1aSopenharmony_ci ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); 1552cabdff1aSopenharmony_ci 1553cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1554cabdff1aSopenharmony_ci vec0 = LD_SH(tmp_odd_buf + 4 * 8); 1555cabdff1aSopenharmony_ci vec1 = LD_SH(tmp_odd_buf + 13 * 8); 1556cabdff1aSopenharmony_ci vec2 = LD_SH(tmp_odd_buf + 10 * 8); 1557cabdff1aSopenharmony_ci vec3 = LD_SH(tmp_odd_buf + 3 * 8); 1558cabdff1aSopenharmony_ci loc0 = LD_SH(tmp_eve_buf + 2 * 8); 1559cabdff1aSopenharmony_ci loc1 = LD_SH(tmp_eve_buf + 10 * 8); 
1560cabdff1aSopenharmony_ci loc2 = LD_SH(tmp_eve_buf + 6 * 8); 1561cabdff1aSopenharmony_ci loc3 = LD_SH(tmp_eve_buf + 14 * 8); 1562cabdff1aSopenharmony_ci 1563cabdff1aSopenharmony_ci ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); 1564cabdff1aSopenharmony_ci 1565cabdff1aSopenharmony_ci ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); 1566cabdff1aSopenharmony_ci ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); 1567cabdff1aSopenharmony_ci ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); 1568cabdff1aSopenharmony_ci ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); 1569cabdff1aSopenharmony_ci 1570cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1571cabdff1aSopenharmony_ci vec0 = LD_SH(tmp_odd_buf + 2 * 8); 1572cabdff1aSopenharmony_ci vec1 = LD_SH(tmp_odd_buf + 11 * 8); 1573cabdff1aSopenharmony_ci vec2 = LD_SH(tmp_odd_buf + 12 * 8); 1574cabdff1aSopenharmony_ci vec3 = LD_SH(tmp_odd_buf + 7 * 8); 1575cabdff1aSopenharmony_ci loc0 = LD_SH(tmp_eve_buf + 1 * 8); 1576cabdff1aSopenharmony_ci loc1 = LD_SH(tmp_eve_buf + 9 * 8); 1577cabdff1aSopenharmony_ci loc2 = LD_SH(tmp_eve_buf + 5 * 8); 1578cabdff1aSopenharmony_ci loc3 = LD_SH(tmp_eve_buf + 13 * 8); 1579cabdff1aSopenharmony_ci 1580cabdff1aSopenharmony_ci ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); 1581cabdff1aSopenharmony_ci 1582cabdff1aSopenharmony_ci ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); 1583cabdff1aSopenharmony_ci ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); 1584cabdff1aSopenharmony_ci ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); 1585cabdff1aSopenharmony_ci ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); 1586cabdff1aSopenharmony_ci 1587cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1588cabdff1aSopenharmony_ci vec0 = LD_SH(tmp_odd_buf + 5 * 8); 1589cabdff1aSopenharmony_ci vec1 = LD_SH(tmp_odd_buf + 15 * 8); 1590cabdff1aSopenharmony_ci vec2 = LD_SH(tmp_odd_buf + 8 * 8); 1591cabdff1aSopenharmony_ci vec3 = LD_SH(tmp_odd_buf + 1 * 8); 1592cabdff1aSopenharmony_ci loc0 = LD_SH(tmp_eve_buf + 3 * 8); 
1593cabdff1aSopenharmony_ci loc1 = LD_SH(tmp_eve_buf + 11 * 8); 1594cabdff1aSopenharmony_ci loc2 = LD_SH(tmp_eve_buf + 7 * 8); 1595cabdff1aSopenharmony_ci loc3 = LD_SH(tmp_eve_buf + 15 * 8); 1596cabdff1aSopenharmony_ci 1597cabdff1aSopenharmony_ci ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); 1598cabdff1aSopenharmony_ci 1599cabdff1aSopenharmony_ci ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); 1600cabdff1aSopenharmony_ci ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); 1601cabdff1aSopenharmony_ci ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); 1602cabdff1aSopenharmony_ci ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); 1603cabdff1aSopenharmony_ci 1604cabdff1aSopenharmony_ci /* Transpose : 16 vectors */ 1605cabdff1aSopenharmony_ci /* 1st & 2nd 8x8 */ 1606cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, 1607cabdff1aSopenharmony_ci m0, n0, m1, n1, m2, n2, m3, n3); 1608cabdff1aSopenharmony_ci ST_SH4(m0, n0, m1, n1, (dst + 0), 32); 1609cabdff1aSopenharmony_ci ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); 1610cabdff1aSopenharmony_ci 1611cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, 1612cabdff1aSopenharmony_ci m4, n4, m5, n5, m6, n6, m7, n7); 1613cabdff1aSopenharmony_ci ST_SH4(m4, n4, m5, n5, (dst + 8), 32); 1614cabdff1aSopenharmony_ci ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); 1615cabdff1aSopenharmony_ci 1616cabdff1aSopenharmony_ci /* 3rd & 4th 8x8 */ 1617cabdff1aSopenharmony_ci LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); 1618cabdff1aSopenharmony_ci LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); 1619cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, 1620cabdff1aSopenharmony_ci m0, n0, m1, n1, m2, n2, m3, n3); 1621cabdff1aSopenharmony_ci ST_SH4(m0, n0, m1, n1, (dst + 16), 32); 1622cabdff1aSopenharmony_ci ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); 1623cabdff1aSopenharmony_ci 1624cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, 
1625cabdff1aSopenharmony_ci m4, n4, m5, n5, m6, n6, m7, n7); 1626cabdff1aSopenharmony_ci ST_SH4(m4, n4, m5, n5, (dst + 24), 32); 1627cabdff1aSopenharmony_ci ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); 1628cabdff1aSopenharmony_ci} 1629cabdff1aSopenharmony_ci 1630cabdff1aSopenharmony_cistatic void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf, 1631cabdff1aSopenharmony_ci int16_t *tmp_eve_buf) 1632cabdff1aSopenharmony_ci{ 1633cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 1634cabdff1aSopenharmony_ci v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 1635cabdff1aSopenharmony_ci v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; 1636cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 1637cabdff1aSopenharmony_ci 1638cabdff1aSopenharmony_ci /* Even stage 1 */ 1639cabdff1aSopenharmony_ci LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 1640cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, tmp_buf, (4 * 32)); 1641cabdff1aSopenharmony_ci tmp_buf += (2 * 32); 1642cabdff1aSopenharmony_ci 1643cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); 1644cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); 1645cabdff1aSopenharmony_ci BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); 1646cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 1647cabdff1aSopenharmony_ci 1648cabdff1aSopenharmony_ci loc1 = vec3; 1649cabdff1aSopenharmony_ci loc0 = vec1; 1650cabdff1aSopenharmony_ci 1651cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); 1652cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); 1653cabdff1aSopenharmony_ci BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); 1654cabdff1aSopenharmony_ci BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); 
1655cabdff1aSopenharmony_ci BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); 1656cabdff1aSopenharmony_ci 1657cabdff1aSopenharmony_ci /* Even stage 2 */ 1658cabdff1aSopenharmony_ci /* Load 8 */ 1659cabdff1aSopenharmony_ci LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 1660cabdff1aSopenharmony_ci ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, tmp_buf, (4 * 32)); 1661cabdff1aSopenharmony_ci 1662cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); 1663cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); 1664cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); 1665cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); 1666cabdff1aSopenharmony_ci 1667cabdff1aSopenharmony_ci vec0 = reg0 + reg4; 1668cabdff1aSopenharmony_ci reg0 = reg0 - reg4; 1669cabdff1aSopenharmony_ci reg4 = reg6 + reg2; 1670cabdff1aSopenharmony_ci reg6 = reg6 - reg2; 1671cabdff1aSopenharmony_ci reg2 = reg1 + reg5; 1672cabdff1aSopenharmony_ci reg1 = reg1 - reg5; 1673cabdff1aSopenharmony_ci reg5 = reg7 + reg3; 1674cabdff1aSopenharmony_ci reg7 = reg7 - reg3; 1675cabdff1aSopenharmony_ci reg3 = vec0; 1676cabdff1aSopenharmony_ci 1677cabdff1aSopenharmony_ci vec1 = reg2; 1678cabdff1aSopenharmony_ci reg2 = reg3 + reg4; 1679cabdff1aSopenharmony_ci reg3 = reg3 - reg4; 1680cabdff1aSopenharmony_ci reg4 = reg5 - vec1; 1681cabdff1aSopenharmony_ci reg5 = reg5 + vec1; 1682cabdff1aSopenharmony_ci 1683cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); 1684cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); 1685cabdff1aSopenharmony_ci 1686cabdff1aSopenharmony_ci vec0 = reg0 - reg6; 1687cabdff1aSopenharmony_ci reg0 = reg0 + reg6; 1688cabdff1aSopenharmony_ci vec1 = reg7 - reg1; 1689cabdff1aSopenharmony_ci reg7 = reg7 + reg1; 
1690cabdff1aSopenharmony_ci 1691cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); 1692cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); 1693cabdff1aSopenharmony_ci 1694cabdff1aSopenharmony_ci /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ 1695cabdff1aSopenharmony_ci /* Store 8 */ 1696cabdff1aSopenharmony_ci BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); 1697cabdff1aSopenharmony_ci ST_SH2(loc1, loc3, tmp_eve_buf, 8); 1698cabdff1aSopenharmony_ci ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); 1699cabdff1aSopenharmony_ci 1700cabdff1aSopenharmony_ci BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); 1701cabdff1aSopenharmony_ci ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); 1702cabdff1aSopenharmony_ci ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); 1703cabdff1aSopenharmony_ci 1704cabdff1aSopenharmony_ci /* Store 8 */ 1705cabdff1aSopenharmony_ci BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); 1706cabdff1aSopenharmony_ci ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); 1707cabdff1aSopenharmony_ci ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); 1708cabdff1aSopenharmony_ci 1709cabdff1aSopenharmony_ci BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); 1710cabdff1aSopenharmony_ci ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); 1711cabdff1aSopenharmony_ci ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); 1712cabdff1aSopenharmony_ci} 1713cabdff1aSopenharmony_ci 1714cabdff1aSopenharmony_cistatic void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf, 1715cabdff1aSopenharmony_ci int16_t *tmp_odd_buf) 1716cabdff1aSopenharmony_ci{ 1717cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 1718cabdff1aSopenharmony_ci v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 1719cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 1720cabdff1aSopenharmony_ci 1721cabdff1aSopenharmony_ci /* Odd stage 1 */ 1722cabdff1aSopenharmony_ci 
reg0 = LD_SH(tmp_buf + 32); 1723cabdff1aSopenharmony_ci reg1 = LD_SH(tmp_buf + 7 * 32); 1724cabdff1aSopenharmony_ci reg2 = LD_SH(tmp_buf + 9 * 32); 1725cabdff1aSopenharmony_ci reg3 = LD_SH(tmp_buf + 15 * 32); 1726cabdff1aSopenharmony_ci reg4 = LD_SH(tmp_buf + 17 * 32); 1727cabdff1aSopenharmony_ci reg5 = LD_SH(tmp_buf + 23 * 32); 1728cabdff1aSopenharmony_ci reg6 = LD_SH(tmp_buf + 25 * 32); 1729cabdff1aSopenharmony_ci reg7 = LD_SH(tmp_buf + 31 * 32); 1730cabdff1aSopenharmony_ci 1731cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 32); 1732cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 7 * 32); 1733cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 9 * 32); 1734cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 15 * 32); 1735cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 17 * 32); 1736cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 23 * 32); 1737cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 25 * 32); 1738cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 31 * 32); 1739cabdff1aSopenharmony_ci 1740cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); 1741cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); 1742cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); 1743cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); 1744cabdff1aSopenharmony_ci 1745cabdff1aSopenharmony_ci vec0 = reg0 + reg3; 1746cabdff1aSopenharmony_ci reg0 = reg0 - reg3; 1747cabdff1aSopenharmony_ci reg3 = reg7 + reg4; 1748cabdff1aSopenharmony_ci reg7 = reg7 - reg4; 1749cabdff1aSopenharmony_ci reg4 = reg1 + reg2; 1750cabdff1aSopenharmony_ci reg1 = reg1 - reg2; 1751cabdff1aSopenharmony_ci reg2 = reg6 + reg5; 1752cabdff1aSopenharmony_ci reg6 = reg6 - reg5; 1753cabdff1aSopenharmony_ci reg5 = vec0; 1754cabdff1aSopenharmony_ci 1755cabdff1aSopenharmony_ci /* 4 Stores */ 1756cabdff1aSopenharmony_ci ADD2(reg5, reg4, reg3, reg2, vec0, vec1); 1757cabdff1aSopenharmony_ci 
ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); 1758cabdff1aSopenharmony_ci SUB2(reg5, reg4, reg3, reg2, vec0, vec1); 1759cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); 1760cabdff1aSopenharmony_ci ST_SH2(vec0, vec1, tmp_odd_buf, 8); 1761cabdff1aSopenharmony_ci 1762cabdff1aSopenharmony_ci /* 4 Stores */ 1763cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); 1764cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); 1765cabdff1aSopenharmony_ci BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); 1766cabdff1aSopenharmony_ci ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); 1767cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); 1768cabdff1aSopenharmony_ci ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); 1769cabdff1aSopenharmony_ci 1770cabdff1aSopenharmony_ci /* Odd stage 2 */ 1771cabdff1aSopenharmony_ci /* 8 loads */ 1772cabdff1aSopenharmony_ci reg0 = LD_SH(tmp_buf + 3 * 32); 1773cabdff1aSopenharmony_ci reg1 = LD_SH(tmp_buf + 5 * 32); 1774cabdff1aSopenharmony_ci reg2 = LD_SH(tmp_buf + 11 * 32); 1775cabdff1aSopenharmony_ci reg3 = LD_SH(tmp_buf + 13 * 32); 1776cabdff1aSopenharmony_ci reg4 = LD_SH(tmp_buf + 19 * 32); 1777cabdff1aSopenharmony_ci reg5 = LD_SH(tmp_buf + 21 * 32); 1778cabdff1aSopenharmony_ci reg6 = LD_SH(tmp_buf + 27 * 32); 1779cabdff1aSopenharmony_ci reg7 = LD_SH(tmp_buf + 29 * 32); 1780cabdff1aSopenharmony_ci 1781cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 3 * 32); 1782cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 5 * 32); 1783cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 11 * 32); 1784cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 13 * 32); 1785cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 19 * 32); 1786cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 21 * 32); 1787cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 27 * 32); 1788cabdff1aSopenharmony_ci ST_SH(zero, tmp_buf + 29 * 32); 
1789cabdff1aSopenharmony_ci 1790cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); 1791cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); 1792cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); 1793cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); 1794cabdff1aSopenharmony_ci 1795cabdff1aSopenharmony_ci /* 4 Stores */ 1796cabdff1aSopenharmony_ci SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, 1797cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1798cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); 1799cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); 1800cabdff1aSopenharmony_ci BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); 1801cabdff1aSopenharmony_ci ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); 1802cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); 1803cabdff1aSopenharmony_ci ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); 1804cabdff1aSopenharmony_ci 1805cabdff1aSopenharmony_ci /* 4 Stores */ 1806cabdff1aSopenharmony_ci ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, 1807cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1808cabdff1aSopenharmony_ci BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); 1809cabdff1aSopenharmony_ci ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); 1810cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); 1811cabdff1aSopenharmony_ci ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); 1812cabdff1aSopenharmony_ci 1813cabdff1aSopenharmony_ci /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ 1814cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1815cabdff1aSopenharmony_ci LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); 1816cabdff1aSopenharmony_ci 
LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); 1817cabdff1aSopenharmony_ci 1818cabdff1aSopenharmony_ci ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, 1819cabdff1aSopenharmony_ci loc0, loc1, loc2, loc3); 1820cabdff1aSopenharmony_ci ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); 1821cabdff1aSopenharmony_ci 1822cabdff1aSopenharmony_ci SUB2(reg0, reg4, reg1, reg5, vec0, vec1); 1823cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 1824cabdff1aSopenharmony_ci 1825cabdff1aSopenharmony_ci SUB2(reg2, reg6, reg3, reg7, vec0, vec1); 1826cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 1827cabdff1aSopenharmony_ci ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); 1828cabdff1aSopenharmony_ci 1829cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1830cabdff1aSopenharmony_ci LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); 1831cabdff1aSopenharmony_ci LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); 1832cabdff1aSopenharmony_ci 1833cabdff1aSopenharmony_ci ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, 1834cabdff1aSopenharmony_ci loc0, loc1, loc2, loc3); 1835cabdff1aSopenharmony_ci ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); 1836cabdff1aSopenharmony_ci 1837cabdff1aSopenharmony_ci SUB2(reg0, reg4, reg3, reg7, vec0, vec1); 1838cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 1839cabdff1aSopenharmony_ci 1840cabdff1aSopenharmony_ci SUB2(reg1, reg5, reg2, reg6, vec0, vec1); 1841cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 1842cabdff1aSopenharmony_ci ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); 1843cabdff1aSopenharmony_ci} 1844cabdff1aSopenharmony_ci 1845cabdff1aSopenharmony_cistatic void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, 1846cabdff1aSopenharmony_ci int16_t *tmp_odd_buf, 1847cabdff1aSopenharmony_ci uint8_t 
*dst, 1848cabdff1aSopenharmony_ci int32_t dst_stride) 1849cabdff1aSopenharmony_ci{ 1850cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 1851cabdff1aSopenharmony_ci v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; 1852cabdff1aSopenharmony_ci 1853cabdff1aSopenharmony_ci /* FINAL BUTTERFLY : Dependency on Even & Odd */ 1854cabdff1aSopenharmony_ci vec0 = LD_SH(tmp_odd_buf); 1855cabdff1aSopenharmony_ci vec1 = LD_SH(tmp_odd_buf + 9 * 8); 1856cabdff1aSopenharmony_ci vec2 = LD_SH(tmp_odd_buf + 14 * 8); 1857cabdff1aSopenharmony_ci vec3 = LD_SH(tmp_odd_buf + 6 * 8); 1858cabdff1aSopenharmony_ci loc0 = LD_SH(tmp_eve_buf); 1859cabdff1aSopenharmony_ci loc1 = LD_SH(tmp_eve_buf + 8 * 8); 1860cabdff1aSopenharmony_ci loc2 = LD_SH(tmp_eve_buf + 4 * 8); 1861cabdff1aSopenharmony_ci loc3 = LD_SH(tmp_eve_buf + 12 * 8); 1862cabdff1aSopenharmony_ci 1863cabdff1aSopenharmony_ci ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); 1864cabdff1aSopenharmony_ci SRARI_H4_SH(m0, m2, m4, m6, 6); 1865cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); 1866cabdff1aSopenharmony_ci 1867cabdff1aSopenharmony_ci SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); 1868cabdff1aSopenharmony_ci SRARI_H4_SH(m0, m2, m4, m6, 6); 1869cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), 1870cabdff1aSopenharmony_ci m0, m2, m4, m6); 1871cabdff1aSopenharmony_ci 1872cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1873cabdff1aSopenharmony_ci vec0 = LD_SH(tmp_odd_buf + 4 * 8); 1874cabdff1aSopenharmony_ci vec1 = LD_SH(tmp_odd_buf + 13 * 8); 1875cabdff1aSopenharmony_ci vec2 = LD_SH(tmp_odd_buf + 10 * 8); 1876cabdff1aSopenharmony_ci vec3 = LD_SH(tmp_odd_buf + 3 * 8); 1877cabdff1aSopenharmony_ci loc0 = LD_SH(tmp_eve_buf + 2 * 8); 1878cabdff1aSopenharmony_ci loc1 = LD_SH(tmp_eve_buf + 10 * 8); 1879cabdff1aSopenharmony_ci loc2 = LD_SH(tmp_eve_buf + 6 * 8); 
1880cabdff1aSopenharmony_ci loc3 = LD_SH(tmp_eve_buf + 14 * 8); 1881cabdff1aSopenharmony_ci 1882cabdff1aSopenharmony_ci ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); 1883cabdff1aSopenharmony_ci SRARI_H4_SH(m1, m3, m5, m7, 6); 1884cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), 1885cabdff1aSopenharmony_ci m1, m3, m5, m7); 1886cabdff1aSopenharmony_ci 1887cabdff1aSopenharmony_ci SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); 1888cabdff1aSopenharmony_ci SRARI_H4_SH(m1, m3, m5, m7, 6); 1889cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), 1890cabdff1aSopenharmony_ci m1, m3, m5, m7); 1891cabdff1aSopenharmony_ci 1892cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1893cabdff1aSopenharmony_ci vec0 = LD_SH(tmp_odd_buf + 2 * 8); 1894cabdff1aSopenharmony_ci vec1 = LD_SH(tmp_odd_buf + 11 * 8); 1895cabdff1aSopenharmony_ci vec2 = LD_SH(tmp_odd_buf + 12 * 8); 1896cabdff1aSopenharmony_ci vec3 = LD_SH(tmp_odd_buf + 7 * 8); 1897cabdff1aSopenharmony_ci loc0 = LD_SH(tmp_eve_buf + 1 * 8); 1898cabdff1aSopenharmony_ci loc1 = LD_SH(tmp_eve_buf + 9 * 8); 1899cabdff1aSopenharmony_ci loc2 = LD_SH(tmp_eve_buf + 5 * 8); 1900cabdff1aSopenharmony_ci loc3 = LD_SH(tmp_eve_buf + 13 * 8); 1901cabdff1aSopenharmony_ci 1902cabdff1aSopenharmony_ci ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); 1903cabdff1aSopenharmony_ci SRARI_H4_SH(n0, n2, n4, n6, 6); 1904cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), 1905cabdff1aSopenharmony_ci n0, n2, n4, n6); 1906cabdff1aSopenharmony_ci 1907cabdff1aSopenharmony_ci SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); 1908cabdff1aSopenharmony_ci SRARI_H4_SH(n0, n2, n4, n6, 6); 1909cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), 1910cabdff1aSopenharmony_ci n0, n2, n4, n6); 1911cabdff1aSopenharmony_ci 1912cabdff1aSopenharmony_ci /* Load 
8 & Store 8 */ 1913cabdff1aSopenharmony_ci vec0 = LD_SH(tmp_odd_buf + 5 * 8); 1914cabdff1aSopenharmony_ci vec1 = LD_SH(tmp_odd_buf + 15 * 8); 1915cabdff1aSopenharmony_ci vec2 = LD_SH(tmp_odd_buf + 8 * 8); 1916cabdff1aSopenharmony_ci vec3 = LD_SH(tmp_odd_buf + 1 * 8); 1917cabdff1aSopenharmony_ci loc0 = LD_SH(tmp_eve_buf + 3 * 8); 1918cabdff1aSopenharmony_ci loc1 = LD_SH(tmp_eve_buf + 11 * 8); 1919cabdff1aSopenharmony_ci loc2 = LD_SH(tmp_eve_buf + 7 * 8); 1920cabdff1aSopenharmony_ci loc3 = LD_SH(tmp_eve_buf + 15 * 8); 1921cabdff1aSopenharmony_ci 1922cabdff1aSopenharmony_ci ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); 1923cabdff1aSopenharmony_ci SRARI_H4_SH(n1, n3, n5, n7, 6); 1924cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), 1925cabdff1aSopenharmony_ci n1, n3, n5, n7); 1926cabdff1aSopenharmony_ci 1927cabdff1aSopenharmony_ci SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); 1928cabdff1aSopenharmony_ci SRARI_H4_SH(n1, n3, n5, n7, 6); 1929cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), 1930cabdff1aSopenharmony_ci n1, n3, n5, n7); 1931cabdff1aSopenharmony_ci} 1932cabdff1aSopenharmony_ci 1933cabdff1aSopenharmony_cistatic void vp9_idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, 1934cabdff1aSopenharmony_ci int32_t dst_stride) 1935cabdff1aSopenharmony_ci{ 1936cabdff1aSopenharmony_ci int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT); 1937cabdff1aSopenharmony_ci int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT); 1938cabdff1aSopenharmony_ci 1939cabdff1aSopenharmony_ci vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); 1940cabdff1aSopenharmony_ci vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); 1941cabdff1aSopenharmony_ci vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], 1942cabdff1aSopenharmony_ci dst, dst_stride); 1943cabdff1aSopenharmony_ci} 1944cabdff1aSopenharmony_ci 
1945cabdff1aSopenharmony_cistatic void vp9_idct8x32_1d_columns_msa(int16_t *input, int16_t *output, 1946cabdff1aSopenharmony_ci int16_t *tmp_buf) 1947cabdff1aSopenharmony_ci{ 1948cabdff1aSopenharmony_ci int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT); 1949cabdff1aSopenharmony_ci int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT); 1950cabdff1aSopenharmony_ci 1951cabdff1aSopenharmony_ci vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); 1952cabdff1aSopenharmony_ci vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); 1953cabdff1aSopenharmony_ci vp9_idct_butterfly_transpose_store(tmp_buf, &tmp_eve_buf[0], 1954cabdff1aSopenharmony_ci &tmp_odd_buf[0], output); 1955cabdff1aSopenharmony_ci} 1956cabdff1aSopenharmony_ci 1957cabdff1aSopenharmony_cistatic void vp9_idct32x32_1_add_msa(int16_t *input, uint8_t *dst, 1958cabdff1aSopenharmony_ci int32_t dst_stride) 1959cabdff1aSopenharmony_ci{ 1960cabdff1aSopenharmony_ci int32_t i; 1961cabdff1aSopenharmony_ci int16_t out; 1962cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; 1963cabdff1aSopenharmony_ci v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; 1964cabdff1aSopenharmony_ci 1965cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); 1966cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); 1967cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO(out, 6); 1968cabdff1aSopenharmony_ci input[0] = 0; 1969cabdff1aSopenharmony_ci 1970cabdff1aSopenharmony_ci vec = __msa_fill_h(out); 1971cabdff1aSopenharmony_ci 1972cabdff1aSopenharmony_ci for (i = 16; i--;) { 1973cabdff1aSopenharmony_ci LD_UB2(dst, 16, dst0, dst1); 1974cabdff1aSopenharmony_ci LD_UB2(dst + dst_stride, 16, dst2, dst3); 1975cabdff1aSopenharmony_ci 1976cabdff1aSopenharmony_ci UNPCK_UB_SH(dst0, res0, res4); 1977cabdff1aSopenharmony_ci UNPCK_UB_SH(dst1, res1, res5); 1978cabdff1aSopenharmony_ci UNPCK_UB_SH(dst2, res2, res6); 
1979cabdff1aSopenharmony_ci UNPCK_UB_SH(dst3, res3, res7); 1980cabdff1aSopenharmony_ci ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, 1981cabdff1aSopenharmony_ci res3); 1982cabdff1aSopenharmony_ci ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, 1983cabdff1aSopenharmony_ci res7); 1984cabdff1aSopenharmony_ci CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7); 1985cabdff1aSopenharmony_ci PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, 1986cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1987cabdff1aSopenharmony_ci 1988cabdff1aSopenharmony_ci ST_UB2(tmp0, tmp1, dst, 16); 1989cabdff1aSopenharmony_ci dst += dst_stride; 1990cabdff1aSopenharmony_ci ST_UB2(tmp2, tmp3, dst, 16); 1991cabdff1aSopenharmony_ci dst += dst_stride; 1992cabdff1aSopenharmony_ci } 1993cabdff1aSopenharmony_ci} 1994cabdff1aSopenharmony_ci 1995cabdff1aSopenharmony_cistatic void vp9_idct32x32_34_colcol_addblk_msa(int16_t *input, uint8_t *dst, 1996cabdff1aSopenharmony_ci int32_t dst_stride) 1997cabdff1aSopenharmony_ci{ 1998cabdff1aSopenharmony_ci int32_t i; 1999cabdff1aSopenharmony_ci int16_t out_arr[32 * 32] ALLOC_ALIGNED(ALIGNMENT); 2000cabdff1aSopenharmony_ci int16_t *out_ptr = out_arr; 2001cabdff1aSopenharmony_ci int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(ALIGNMENT); 2002cabdff1aSopenharmony_ci 2003cabdff1aSopenharmony_ci for (i = 32; i--;) { 2004cabdff1aSopenharmony_ci __asm__ volatile ( 2005cabdff1aSopenharmony_ci "sw $zero, (%[out_ptr]) \n\t" 2006cabdff1aSopenharmony_ci "sw $zero, 4(%[out_ptr]) \n\t" 2007cabdff1aSopenharmony_ci "sw $zero, 8(%[out_ptr]) \n\t" 2008cabdff1aSopenharmony_ci "sw $zero, 12(%[out_ptr]) \n\t" 2009cabdff1aSopenharmony_ci "sw $zero, 16(%[out_ptr]) \n\t" 2010cabdff1aSopenharmony_ci "sw $zero, 20(%[out_ptr]) \n\t" 2011cabdff1aSopenharmony_ci "sw $zero, 24(%[out_ptr]) \n\t" 2012cabdff1aSopenharmony_ci "sw $zero, 28(%[out_ptr]) \n\t" 2013cabdff1aSopenharmony_ci "sw $zero, 32(%[out_ptr]) \n\t" 2014cabdff1aSopenharmony_ci "sw 
$zero, 36(%[out_ptr]) \n\t" 2015cabdff1aSopenharmony_ci "sw $zero, 40(%[out_ptr]) \n\t" 2016cabdff1aSopenharmony_ci "sw $zero, 44(%[out_ptr]) \n\t" 2017cabdff1aSopenharmony_ci "sw $zero, 48(%[out_ptr]) \n\t" 2018cabdff1aSopenharmony_ci "sw $zero, 52(%[out_ptr]) \n\t" 2019cabdff1aSopenharmony_ci "sw $zero, 56(%[out_ptr]) \n\t" 2020cabdff1aSopenharmony_ci "sw $zero, 60(%[out_ptr]) \n\t" 2021cabdff1aSopenharmony_ci 2022cabdff1aSopenharmony_ci : 2023cabdff1aSopenharmony_ci : [out_ptr] "r" (out_ptr) 2024cabdff1aSopenharmony_ci ); 2025cabdff1aSopenharmony_ci 2026cabdff1aSopenharmony_ci out_ptr += 32; 2027cabdff1aSopenharmony_ci } 2028cabdff1aSopenharmony_ci 2029cabdff1aSopenharmony_ci out_ptr = out_arr; 2030cabdff1aSopenharmony_ci 2031cabdff1aSopenharmony_ci /* process 8*32 block */ 2032cabdff1aSopenharmony_ci vp9_idct8x32_1d_columns_msa(input, out_ptr, &tmp_buf[0]); 2033cabdff1aSopenharmony_ci 2034cabdff1aSopenharmony_ci /* transform columns */ 2035cabdff1aSopenharmony_ci for (i = 0; i < 4; i++) { 2036cabdff1aSopenharmony_ci /* process 8*32 block */ 2037cabdff1aSopenharmony_ci vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), 2038cabdff1aSopenharmony_ci (dst + (i << 3)), dst_stride); 2039cabdff1aSopenharmony_ci } 2040cabdff1aSopenharmony_ci} 2041cabdff1aSopenharmony_ci 2042cabdff1aSopenharmony_cistatic void vp9_idct32x32_colcol_addblk_msa(int16_t *input, uint8_t *dst, 2043cabdff1aSopenharmony_ci int32_t dst_stride) 2044cabdff1aSopenharmony_ci{ 2045cabdff1aSopenharmony_ci int32_t i; 2046cabdff1aSopenharmony_ci int16_t out_arr[32 * 32] ALLOC_ALIGNED(ALIGNMENT); 2047cabdff1aSopenharmony_ci int16_t *out_ptr = out_arr; 2048cabdff1aSopenharmony_ci int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(ALIGNMENT); 2049cabdff1aSopenharmony_ci 2050cabdff1aSopenharmony_ci /* transform rows */ 2051cabdff1aSopenharmony_ci for (i = 0; i < 4; i++) { 2052cabdff1aSopenharmony_ci /* process 8*32 block */ 2053cabdff1aSopenharmony_ci vp9_idct8x32_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 
8)), 2054cabdff1aSopenharmony_ci &tmp_buf[0]); 2055cabdff1aSopenharmony_ci } 2056cabdff1aSopenharmony_ci 2057cabdff1aSopenharmony_ci /* transform columns */ 2058cabdff1aSopenharmony_ci for (i = 0; i < 4; i++) { 2059cabdff1aSopenharmony_ci /* process 8*32 block */ 2060cabdff1aSopenharmony_ci vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), 2061cabdff1aSopenharmony_ci (dst + (i << 3)), dst_stride); 2062cabdff1aSopenharmony_ci } 2063cabdff1aSopenharmony_ci} 2064cabdff1aSopenharmony_ci 2065cabdff1aSopenharmony_civoid ff_idct_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, 2066cabdff1aSopenharmony_ci int16_t *block, int eob) 2067cabdff1aSopenharmony_ci{ 2068cabdff1aSopenharmony_ci if (eob > 1) { 2069cabdff1aSopenharmony_ci vp9_idct4x4_colcol_addblk_msa(block, dst, stride); 2070cabdff1aSopenharmony_ci } 2071cabdff1aSopenharmony_ci else { 2072cabdff1aSopenharmony_ci vp9_idct4x4_1_add_msa(block, dst, stride); 2073cabdff1aSopenharmony_ci } 2074cabdff1aSopenharmony_ci} 2075cabdff1aSopenharmony_ci 2076cabdff1aSopenharmony_civoid ff_idct_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, 2077cabdff1aSopenharmony_ci int16_t *block, int eob) 2078cabdff1aSopenharmony_ci{ 2079cabdff1aSopenharmony_ci if (eob == 1) { 2080cabdff1aSopenharmony_ci vp9_idct8x8_1_add_msa(block, dst, stride); 2081cabdff1aSopenharmony_ci } 2082cabdff1aSopenharmony_ci else if (eob <= 12) { 2083cabdff1aSopenharmony_ci vp9_idct8x8_12_colcol_addblk_msa(block, dst, stride); 2084cabdff1aSopenharmony_ci } 2085cabdff1aSopenharmony_ci else { 2086cabdff1aSopenharmony_ci vp9_idct8x8_colcol_addblk_msa(block, dst, stride); 2087cabdff1aSopenharmony_ci } 2088cabdff1aSopenharmony_ci} 2089cabdff1aSopenharmony_ci 2090cabdff1aSopenharmony_civoid ff_idct_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, 2091cabdff1aSopenharmony_ci int16_t *block, int eob) 2092cabdff1aSopenharmony_ci{ 2093cabdff1aSopenharmony_ci if (eob == 1) { 2094cabdff1aSopenharmony_ci /* DC only DCT coefficient. 
*/ 2095cabdff1aSopenharmony_ci vp9_idct16x16_1_add_msa(block, dst, stride); 2096cabdff1aSopenharmony_ci } 2097cabdff1aSopenharmony_ci else if (eob <= 10) { 2098cabdff1aSopenharmony_ci vp9_idct16x16_10_colcol_addblk_msa(block, dst, stride); 2099cabdff1aSopenharmony_ci } 2100cabdff1aSopenharmony_ci else { 2101cabdff1aSopenharmony_ci vp9_idct16x16_colcol_addblk_msa(block, dst, stride); 2102cabdff1aSopenharmony_ci } 2103cabdff1aSopenharmony_ci} 2104cabdff1aSopenharmony_ci 2105cabdff1aSopenharmony_civoid ff_idct_idct_32x32_add_msa(uint8_t *dst, ptrdiff_t stride, 2106cabdff1aSopenharmony_ci int16_t *block, int eob) 2107cabdff1aSopenharmony_ci{ 2108cabdff1aSopenharmony_ci if (eob == 1) { 2109cabdff1aSopenharmony_ci vp9_idct32x32_1_add_msa(block, dst, stride); 2110cabdff1aSopenharmony_ci } 2111cabdff1aSopenharmony_ci else if (eob <= 34) { 2112cabdff1aSopenharmony_ci vp9_idct32x32_34_colcol_addblk_msa(block, dst, stride); 2113cabdff1aSopenharmony_ci } 2114cabdff1aSopenharmony_ci else { 2115cabdff1aSopenharmony_ci vp9_idct32x32_colcol_addblk_msa(block, dst, stride); 2116cabdff1aSopenharmony_ci } 2117cabdff1aSopenharmony_ci} 2118cabdff1aSopenharmony_ci 2119cabdff1aSopenharmony_civoid ff_iadst_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, 2120cabdff1aSopenharmony_ci int16_t *block, int eob) 2121cabdff1aSopenharmony_ci{ 2122cabdff1aSopenharmony_ci vp9_iadst4x4_colcol_addblk_msa(block, dst, stride); 2123cabdff1aSopenharmony_ci} 2124cabdff1aSopenharmony_ci 2125cabdff1aSopenharmony_civoid ff_iadst_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, 2126cabdff1aSopenharmony_ci int16_t *block, int eob) 2127cabdff1aSopenharmony_ci{ 2128cabdff1aSopenharmony_ci vp9_iadst8x8_colcol_addblk_msa(block, dst, stride); 2129cabdff1aSopenharmony_ci} 2130cabdff1aSopenharmony_ci 2131cabdff1aSopenharmony_civoid ff_iadst_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, 2132cabdff1aSopenharmony_ci int16_t *block, int eob) 2133cabdff1aSopenharmony_ci{ 2134cabdff1aSopenharmony_ci 
vp9_iadst16x16_colcol_addblk_msa(block, dst, stride); 2135cabdff1aSopenharmony_ci} 2136cabdff1aSopenharmony_ci 2137cabdff1aSopenharmony_civoid ff_idct_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, 2138cabdff1aSopenharmony_ci int16_t *block, int eob) 2139cabdff1aSopenharmony_ci{ 2140cabdff1aSopenharmony_ci vp9_idct_iadst_4x4_add_msa(block, dst, stride, eob); 2141cabdff1aSopenharmony_ci} 2142cabdff1aSopenharmony_ci 2143cabdff1aSopenharmony_civoid ff_idct_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, 2144cabdff1aSopenharmony_ci int16_t *block, int eob) 2145cabdff1aSopenharmony_ci{ 2146cabdff1aSopenharmony_ci vp9_idct_iadst_8x8_add_msa(block, dst, stride, eob); 2147cabdff1aSopenharmony_ci} 2148cabdff1aSopenharmony_ci 2149cabdff1aSopenharmony_civoid ff_idct_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, 2150cabdff1aSopenharmony_ci int16_t *block, int eob) 2151cabdff1aSopenharmony_ci{ 2152cabdff1aSopenharmony_ci vp9_idct_iadst_16x16_add_msa(block, dst, stride, eob); 2153cabdff1aSopenharmony_ci} 2154cabdff1aSopenharmony_ci 2155cabdff1aSopenharmony_civoid ff_iadst_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, 2156cabdff1aSopenharmony_ci int16_t *block, int eob) 2157cabdff1aSopenharmony_ci{ 2158cabdff1aSopenharmony_ci vp9_iadst_idct_4x4_add_msa(block, dst, stride, eob); 2159cabdff1aSopenharmony_ci} 2160cabdff1aSopenharmony_ci 2161cabdff1aSopenharmony_civoid ff_iadst_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, 2162cabdff1aSopenharmony_ci int16_t *block, int eob) 2163cabdff1aSopenharmony_ci{ 2164cabdff1aSopenharmony_ci vp9_iadst_idct_8x8_add_msa(block, dst, stride, eob); 2165cabdff1aSopenharmony_ci} 2166cabdff1aSopenharmony_ci 2167cabdff1aSopenharmony_civoid ff_iadst_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, 2168cabdff1aSopenharmony_ci int16_t *block, int eob) 2169cabdff1aSopenharmony_ci{ 2170cabdff1aSopenharmony_ci vp9_iadst_idct_16x16_add_msa(block, dst, stride, eob); 2171cabdff1aSopenharmony_ci} 2172