1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited 3cabdff1aSopenharmony_ci * Contributed by Jin Bo <jinbo@loongson.cn> 4cabdff1aSopenharmony_ci * 5cabdff1aSopenharmony_ci * This file is part of FFmpeg. 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15cabdff1aSopenharmony_ci * Lesser General Public License for more details. 16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20cabdff1aSopenharmony_ci */ 21cabdff1aSopenharmony_ci 22cabdff1aSopenharmony_ci#include "libavcodec/vp9dsp.h" 23cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 24cabdff1aSopenharmony_ci#include "vp9dsp_loongarch.h" 25cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 26cabdff1aSopenharmony_ci 27cabdff1aSopenharmony_ci#define VP9_DCT_CONST_BITS 14 28cabdff1aSopenharmony_ci#define ALLOC_ALIGNED(align) __attribute__ ((aligned(align))) 29cabdff1aSopenharmony_ci#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n)) 30cabdff1aSopenharmony_ci 31cabdff1aSopenharmony_ciconst int32_t cospi_1_64 = 16364; 32cabdff1aSopenharmony_ciconst int32_t cospi_2_64 = 16305; 33cabdff1aSopenharmony_ciconst int32_t cospi_3_64 = 16207; 34cabdff1aSopenharmony_ciconst int32_t cospi_4_64 = 16069; 35cabdff1aSopenharmony_ciconst int32_t cospi_5_64 = 15893; 36cabdff1aSopenharmony_ciconst int32_t cospi_6_64 = 15679; 37cabdff1aSopenharmony_ciconst int32_t cospi_7_64 = 15426; 38cabdff1aSopenharmony_ciconst int32_t cospi_8_64 = 15137; 39cabdff1aSopenharmony_ciconst int32_t cospi_9_64 = 14811; 40cabdff1aSopenharmony_ciconst int32_t cospi_10_64 = 14449; 41cabdff1aSopenharmony_ciconst int32_t cospi_11_64 = 14053; 42cabdff1aSopenharmony_ciconst int32_t cospi_12_64 = 13623; 43cabdff1aSopenharmony_ciconst int32_t cospi_13_64 = 13160; 44cabdff1aSopenharmony_ciconst int32_t cospi_14_64 = 12665; 45cabdff1aSopenharmony_ciconst int32_t cospi_15_64 = 12140; 46cabdff1aSopenharmony_ciconst int32_t cospi_16_64 = 11585; 47cabdff1aSopenharmony_ciconst int32_t cospi_17_64 = 11003; 48cabdff1aSopenharmony_ciconst int32_t cospi_18_64 = 10394; 49cabdff1aSopenharmony_ciconst int32_t cospi_19_64 = 9760; 50cabdff1aSopenharmony_ciconst int32_t cospi_20_64 = 9102; 51cabdff1aSopenharmony_ciconst int32_t cospi_21_64 = 8423; 52cabdff1aSopenharmony_ciconst int32_t cospi_22_64 = 7723; 53cabdff1aSopenharmony_ciconst int32_t cospi_23_64 = 7005; 54cabdff1aSopenharmony_ciconst int32_t cospi_24_64 = 6270; 55cabdff1aSopenharmony_ciconst int32_t cospi_25_64 = 5520; 56cabdff1aSopenharmony_ciconst int32_t cospi_26_64 = 4756; 57cabdff1aSopenharmony_ciconst int32_t cospi_27_64 = 3981; 58cabdff1aSopenharmony_ciconst int32_t cospi_28_64 = 3196; 59cabdff1aSopenharmony_ciconst int32_t cospi_29_64 = 2404; 60cabdff1aSopenharmony_ciconst int32_t cospi_30_64 = 1606; 61cabdff1aSopenharmony_ciconst int32_t cospi_31_64 = 804; 62cabdff1aSopenharmony_ci 63cabdff1aSopenharmony_ciconst int32_t sinpi_1_9 = 5283; 64cabdff1aSopenharmony_ciconst int32_t sinpi_2_9 = 9929; 65cabdff1aSopenharmony_ciconst int32_t sinpi_3_9 = 13377; 66cabdff1aSopenharmony_ciconst int32_t sinpi_4_9 = 15212; 67cabdff1aSopenharmony_ci 68cabdff1aSopenharmony_ci#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ 69cabdff1aSopenharmony_ci{ \ 70cabdff1aSopenharmony_ci __m128i k0_m = __lsx_vreplgr2vr_h(cnst0); \ 71cabdff1aSopenharmony_ci __m128i s0_m, s1_m, s2_m, s3_m; \ 72cabdff1aSopenharmony_ci \ 73cabdff1aSopenharmony_ci s0_m = __lsx_vreplgr2vr_h(cnst1); \ 74cabdff1aSopenharmony_ci k0_m = __lsx_vpackev_h(s0_m, k0_m); \ 75cabdff1aSopenharmony_ci \ 76cabdff1aSopenharmony_ci s1_m = __lsx_vilvl_h(__lsx_vneg_h(reg1), reg0); \ 77cabdff1aSopenharmony_ci s0_m = __lsx_vilvh_h(__lsx_vneg_h(reg1), reg0); \ 78cabdff1aSopenharmony_ci s3_m = __lsx_vilvl_h(reg0, reg1); \ 79cabdff1aSopenharmony_ci s2_m = __lsx_vilvh_h(reg0, reg1); \ 80cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_w_h, s1_m, k0_m, s0_m, k0_m, s1_m, s0_m); \ 81cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsrari_w, s1_m, VP9_DCT_CONST_BITS, \ 82cabdff1aSopenharmony_ci s0_m, VP9_DCT_CONST_BITS, s1_m, s0_m); \ 83cabdff1aSopenharmony_ci out0 = __lsx_vpickev_h(s0_m, s1_m); \ 84cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k0_m, s2_m, k0_m, s1_m, s0_m); \ 85cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsrari_w, s1_m, VP9_DCT_CONST_BITS, \ 86cabdff1aSopenharmony_ci s0_m, VP9_DCT_CONST_BITS, s1_m, s0_m); \ 87cabdff1aSopenharmony_ci out1 = __lsx_vpickev_h(s0_m, s1_m); \ 88cabdff1aSopenharmony_ci} 89cabdff1aSopenharmony_ci 90cabdff1aSopenharmony_ci#define VP9_SET_COSPI_PAIR(c0_h, c1_h) \ 91cabdff1aSopenharmony_ci( { \ 92cabdff1aSopenharmony_ci __m128i out0_m, r0_m, r1_m; \ 93cabdff1aSopenharmony_ci \ 94cabdff1aSopenharmony_ci r0_m = __lsx_vreplgr2vr_h(c0_h); \ 95cabdff1aSopenharmony_ci r1_m = __lsx_vreplgr2vr_h(c1_h); \ 96cabdff1aSopenharmony_ci out0_m = __lsx_vpackev_h(r1_m, r0_m); \ 97cabdff1aSopenharmony_ci \ 98cabdff1aSopenharmony_ci out0_m; \ 99cabdff1aSopenharmony_ci} ) 100cabdff1aSopenharmony_ci 101cabdff1aSopenharmony_ci#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \ 102cabdff1aSopenharmony_ci{ \ 103cabdff1aSopenharmony_ci uint8_t *dst_m = (uint8_t *) (dst); \ 104cabdff1aSopenharmony_ci __m128i dst0_m, dst1_m, dst2_m, dst3_m; \ 105cabdff1aSopenharmony_ci __m128i tmp0_m, tmp1_m; \ 106cabdff1aSopenharmony_ci __m128i res0_m, res1_m, res2_m, res3_m; \ 107cabdff1aSopenharmony_ci __m128i zero_m = __lsx_vldi(0); \ 108cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, dst_m, 0, dst_m + dst_stride, 0, \ 109cabdff1aSopenharmony_ci dst_m + 2 * dst_stride, 0, dst_m + 3 * dst_stride, 0, \ 110cabdff1aSopenharmony_ci dst0_m, dst1_m, dst2_m, dst3_m); \ 111cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, zero_m, dst0_m, zero_m, dst1_m, zero_m, \ 112cabdff1aSopenharmony_ci dst2_m, zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);\ 113cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, \ 114cabdff1aSopenharmony_ci res3_m, in3, res0_m, res1_m, res2_m, res3_m); \ 115cabdff1aSopenharmony_ci DUP4_ARG1(__lsx_vclip255_h, res0_m, res1_m, res2_m, res3_m, \ 116cabdff1aSopenharmony_ci res0_m, res1_m, res2_m, res3_m); \ 117cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_b, res1_m, res0_m, res3_m, res2_m, \ 118cabdff1aSopenharmony_ci tmp0_m, tmp1_m); \ 119cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp0_m, dst_m, 0, 0); \ 120cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp0_m, dst_m + dst_stride, 0, 1); \ 121cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp1_m, dst_m + 2 * dst_stride, 0, 0); \ 122cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp1_m, dst_m + 3 * dst_stride, 0, 1); \ 123cabdff1aSopenharmony_ci} 124cabdff1aSopenharmony_ci 125cabdff1aSopenharmony_ci#define VP9_UNPCK_UB_SH(in, out_h, out_l) \ 126cabdff1aSopenharmony_ci{ \ 127cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); \ 128cabdff1aSopenharmony_ci out_l = __lsx_vilvl_b(zero, in); \ 129cabdff1aSopenharmony_ci out_h = __lsx_vilvh_b(zero, in); \ 130cabdff1aSopenharmony_ci} 131cabdff1aSopenharmony_ci 132cabdff1aSopenharmony_ci#define VP9_ILVLTRANS4x8_H(in0, in1, in2, in3, in4, in5, in6, in7, \ 133cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7) \ 134cabdff1aSopenharmony_ci{ \ 135cabdff1aSopenharmony_ci __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 136cabdff1aSopenharmony_ci __m128i tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ 137cabdff1aSopenharmony_ci __m128i zero_m = __lsx_vldi(0); \ 138cabdff1aSopenharmony_ci \ 139cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6, \ 140cabdff1aSopenharmony_ci tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ 141cabdff1aSopenharmony_ci tmp0_m = __lsx_vilvl_w(tmp1_n, tmp0_n); \ 142cabdff1aSopenharmony_ci tmp2_m = __lsx_vilvh_w(tmp1_n, tmp0_n); \ 143cabdff1aSopenharmony_ci tmp1_m = __lsx_vilvl_w(tmp3_n, tmp2_n); \ 144cabdff1aSopenharmony_ci tmp3_m = __lsx_vilvh_w(tmp3_n, tmp2_n); \ 145cabdff1aSopenharmony_ci \ 146cabdff1aSopenharmony_ci out0 = __lsx_vilvl_d(tmp1_m, tmp0_m); \ 147cabdff1aSopenharmony_ci out1 = __lsx_vilvh_d(tmp1_m, tmp0_m); \ 148cabdff1aSopenharmony_ci out2 = __lsx_vilvl_d(tmp3_m, tmp2_m); \ 149cabdff1aSopenharmony_ci out3 = __lsx_vilvh_d(tmp3_m, tmp2_m); \ 150cabdff1aSopenharmony_ci \ 151cabdff1aSopenharmony_ci out4 = zero_m; \ 152cabdff1aSopenharmony_ci out5 = zero_m; \ 153cabdff1aSopenharmony_ci out6 = zero_m; \ 154cabdff1aSopenharmony_ci out7 = zero_m; \ 155cabdff1aSopenharmony_ci} 156cabdff1aSopenharmony_ci 157cabdff1aSopenharmony_ci/* multiply and add macro */ 158cabdff1aSopenharmony_ci#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ 159cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 160cabdff1aSopenharmony_ci{ \ 161cabdff1aSopenharmony_ci __m128i madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ 162cabdff1aSopenharmony_ci __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 163cabdff1aSopenharmony_ci \ 164cabdff1aSopenharmony_ci madd_s1_m = __lsx_vilvl_h(inp1, inp0); \ 165cabdff1aSopenharmony_ci madd_s0_m = __lsx_vilvh_h(inp1, inp0); \ 166cabdff1aSopenharmony_ci madd_s3_m = __lsx_vilvl_h(inp3, inp2); \ 167cabdff1aSopenharmony_ci madd_s2_m = __lsx_vilvh_h(inp3, inp2); \ 168cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, madd_s1_m, cst0, madd_s0_m, cst0, \ 169cabdff1aSopenharmony_ci madd_s1_m, cst1, madd_s0_m, cst1, tmp0_m, tmp1_m, \ 170cabdff1aSopenharmony_ci tmp2_m, tmp3_m); \ 171cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_w, tmp0_m, VP9_DCT_CONST_BITS, tmp1_m, \ 172cabdff1aSopenharmony_ci VP9_DCT_CONST_BITS, tmp2_m, VP9_DCT_CONST_BITS, tmp3_m, \ 173cabdff1aSopenharmony_ci VP9_DCT_CONST_BITS, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 174cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ 175cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, madd_s3_m, cst2, madd_s2_m, cst2, madd_s3_m, \ 176cabdff1aSopenharmony_ci cst3, madd_s2_m, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 177cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_w, tmp0_m, VP9_DCT_CONST_BITS, \ 178cabdff1aSopenharmony_ci tmp1_m, VP9_DCT_CONST_BITS, tmp2_m, VP9_DCT_CONST_BITS, \ 179cabdff1aSopenharmony_ci tmp3_m, VP9_DCT_CONST_BITS, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 180cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ 181cabdff1aSopenharmony_ci} 182cabdff1aSopenharmony_ci 183cabdff1aSopenharmony_ci#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ 184cabdff1aSopenharmony_ci( { \ 185cabdff1aSopenharmony_ci __m128i c0_m, c1_m; \ 186cabdff1aSopenharmony_ci \ 187cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_h, mask_h, idx1_h, mask_h, idx2_h, c0_m, c1_m); \ 188cabdff1aSopenharmony_ci c0_m = __lsx_vpackev_h(c1_m, c0_m); \ 189cabdff1aSopenharmony_ci \ 190cabdff1aSopenharmony_ci c0_m; \ 191cabdff1aSopenharmony_ci} ) 192cabdff1aSopenharmony_ci 193cabdff1aSopenharmony_ci/* idct 8x8 macro */ 194cabdff1aSopenharmony_ci#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ 195cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7) \ 196cabdff1aSopenharmony_ci{ \ 197cabdff1aSopenharmony_ci __m128i tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ 198cabdff1aSopenharmony_ci __m128i k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ 199cabdff1aSopenharmony_ci __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 200cabdff1aSopenharmony_ci v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ 201cabdff1aSopenharmony_ci cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ 202cabdff1aSopenharmony_ci \ 203cabdff1aSopenharmony_ci k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \ 204cabdff1aSopenharmony_ci k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \ 205cabdff1aSopenharmony_ci k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \ 206cabdff1aSopenharmony_ci k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \ 207cabdff1aSopenharmony_ci VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ 208cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsub_h, in1, in3, in7, in5, res0_m, res1_m); \ 209cabdff1aSopenharmony_ci k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \ 210cabdff1aSopenharmony_ci k1_m = __lsx_vreplvei_h(mask_m, 4); \ 211cabdff1aSopenharmony_ci \ 212cabdff1aSopenharmony_ci res2_m = __lsx_vilvl_h(res0_m, res1_m); \ 213cabdff1aSopenharmony_ci res3_m = __lsx_vilvh_h(res0_m, res1_m); \ 214cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, res2_m, k0_m, res3_m, k0_m, res2_m, k1_m, \ 215cabdff1aSopenharmony_ci res3_m, k1_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 216cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_w, tmp0_m, VP9_DCT_CONST_BITS, \ 217cabdff1aSopenharmony_ci tmp1_m, VP9_DCT_CONST_BITS, tmp2_m, VP9_DCT_CONST_BITS, \ 218cabdff1aSopenharmony_ci tmp3_m, VP9_DCT_CONST_BITS, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 219cabdff1aSopenharmony_ci tp4_m = __lsx_vadd_h(in1, in3); \ 220cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ 221cabdff1aSopenharmony_ci tp7_m = __lsx_vadd_h(in7, in5); \ 222cabdff1aSopenharmony_ci k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ 223cabdff1aSopenharmony_ci k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ 224cabdff1aSopenharmony_ci VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \ 225cabdff1aSopenharmony_ci in0, in4, in2, in6); \ 226cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ 227cabdff1aSopenharmony_ci LSX_BUTTERFLY_8_H(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \ 228cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7); \ 229cabdff1aSopenharmony_ci} 230cabdff1aSopenharmony_ci 231cabdff1aSopenharmony_cistatic av_always_inline 232cabdff1aSopenharmony_civoid vp9_idct8x8_1_add_lsx(int16_t *input, uint8_t *dst, 233cabdff1aSopenharmony_ci int32_t dst_stride) 234cabdff1aSopenharmony_ci{ 235cabdff1aSopenharmony_ci int16_t out; 236cabdff1aSopenharmony_ci int32_t val; 237cabdff1aSopenharmony_ci __m128i vec; 238cabdff1aSopenharmony_ci 239cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); 240cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); 241cabdff1aSopenharmony_ci val = ROUND_POWER_OF_TWO(out, 5); 242cabdff1aSopenharmony_ci vec = __lsx_vreplgr2vr_h(val); 243cabdff1aSopenharmony_ci input[0] = 0; 244cabdff1aSopenharmony_ci 245cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); 246cabdff1aSopenharmony_ci dst += (4 * dst_stride); 247cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); 248cabdff1aSopenharmony_ci} 249cabdff1aSopenharmony_ci 250cabdff1aSopenharmony_cistatic void vp9_idct8x8_12_colcol_addblk_lsx(int16_t *input, uint8_t *dst, 251cabdff1aSopenharmony_ci int32_t dst_stride) 252cabdff1aSopenharmony_ci{ 253cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5, in6, in7; 254cabdff1aSopenharmony_ci __m128i s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3; 255cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3; 256cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 257cabdff1aSopenharmony_ci 258cabdff1aSopenharmony_ci /* load vector elements of 8x8 block */ 259cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, 260cabdff1aSopenharmony_ci in0, in1, in2, in3); 261cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, 262cabdff1aSopenharmony_ci in4, in5, in6, in7); 263cabdff1aSopenharmony_ci __lsx_vst(zero, input, 0); 264cabdff1aSopenharmony_ci __lsx_vst(zero, input, 16); 265cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32); 266cabdff1aSopenharmony_ci __lsx_vst(zero, input, 48); 267cabdff1aSopenharmony_ci __lsx_vst(zero, input, 64); 268cabdff1aSopenharmony_ci __lsx_vst(zero, input, 80); 269cabdff1aSopenharmony_ci __lsx_vst(zero, input, 96); 270cabdff1aSopenharmony_ci __lsx_vst(zero, input, 112); 271cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_d,in1, in0, in3, in2, in5, in4, in7, 272cabdff1aSopenharmony_ci in6, in0, in1, in2, in3); 273cabdff1aSopenharmony_ci 274cabdff1aSopenharmony_ci /* stage1 */ 275cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, in3, in0, in2, in1, s0, s1); 276cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); 277cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); 278cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); 279cabdff1aSopenharmony_ci k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); 280cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, s0, k0, s0, k1, s1, k2, s1, k3, 281cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 282cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_w, tmp0, VP9_DCT_CONST_BITS, tmp1, 283cabdff1aSopenharmony_ci VP9_DCT_CONST_BITS, tmp2, VP9_DCT_CONST_BITS, tmp3, 284cabdff1aSopenharmony_ci VP9_DCT_CONST_BITS, tmp0, tmp1, tmp2, tmp3); 285cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpickev_h, zero, tmp0, zero, tmp1, zero, tmp2, zero, tmp3, 286cabdff1aSopenharmony_ci s0, s1, s2, s3); 287cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(s0, s1, s3, s2, s4, s7, s6, s5); 288cabdff1aSopenharmony_ci 289cabdff1aSopenharmony_ci /* stage2 */ 290cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, in3, in1, in2, in0, s1, s0); 291cabdff1aSopenharmony_ci k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); 292cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); 293cabdff1aSopenharmony_ci k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); 294cabdff1aSopenharmony_ci k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); 295cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, s0, k0, s0, k1, s1, k2, s1, k3, 296cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 297cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_w, tmp0, VP9_DCT_CONST_BITS, tmp1, 298cabdff1aSopenharmony_ci VP9_DCT_CONST_BITS, tmp2, VP9_DCT_CONST_BITS, tmp3, 299cabdff1aSopenharmony_ci VP9_DCT_CONST_BITS, tmp0, tmp1, tmp2, tmp3); 300cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpickev_h, zero, tmp0, zero, tmp1, zero, tmp2, zero, tmp3, 301cabdff1aSopenharmony_ci s0, s1, s2, s3); 302cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(s0, s1, s2, s3, m0, m1, m2, m3); 303cabdff1aSopenharmony_ci 304cabdff1aSopenharmony_ci /* stage3 */ 305cabdff1aSopenharmony_ci s0 = __lsx_vilvl_h(s6, s5); 306cabdff1aSopenharmony_ci 307cabdff1aSopenharmony_ci k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); 308cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_w_h, s0, k1, s0, k0, tmp0, tmp1); 309cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsrari_w, tmp0, VP9_DCT_CONST_BITS, tmp1, 310cabdff1aSopenharmony_ci VP9_DCT_CONST_BITS, tmp0, tmp1); 311cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, zero, tmp0, zero, tmp1, s2, s3); 312cabdff1aSopenharmony_ci 313cabdff1aSopenharmony_ci /* stage4 */ 314cabdff1aSopenharmony_ci LSX_BUTTERFLY_8_H(m0, m1, m2, m3, s4, s2, s3, s7, 315cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 316cabdff1aSopenharmony_ci VP9_ILVLTRANS4x8_H(in0, in1, in2, in3, in4, in5, in6, in7, 317cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 318cabdff1aSopenharmony_ci VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, 319cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 320cabdff1aSopenharmony_ci 321cabdff1aSopenharmony_ci /* final rounding (add 2^4, divide by 2^5) and shift */ 322cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, in0 , 5, in1, 5, in2, 5, in3, 5, 323cabdff1aSopenharmony_ci in0, in1, in2, in3); 324cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, in4 , 5, in5, 5, in6, 5, in7, 5, 325cabdff1aSopenharmony_ci in4, in5, in6, in7); 326cabdff1aSopenharmony_ci 327cabdff1aSopenharmony_ci /* add block and store 8x8 */ 328cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); 329cabdff1aSopenharmony_ci dst += (4 * dst_stride); 330cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); 331cabdff1aSopenharmony_ci} 332cabdff1aSopenharmony_ci 333cabdff1aSopenharmony_cistatic void vp9_idct8x8_colcol_addblk_lsx(int16_t *input, uint8_t *dst, 334cabdff1aSopenharmony_ci int32_t dst_stride) 335cabdff1aSopenharmony_ci{ 336cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5, in6, in7; 337cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 338cabdff1aSopenharmony_ci 339cabdff1aSopenharmony_ci /* load vector elements of 8x8 block */ 340cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, 341cabdff1aSopenharmony_ci in0, in1, in2, in3); 342cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, 343cabdff1aSopenharmony_ci in4, in5, in6, in7); 344cabdff1aSopenharmony_ci __lsx_vst(zero, input, 0); 345cabdff1aSopenharmony_ci __lsx_vst(zero, input, 16); 346cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32); 347cabdff1aSopenharmony_ci __lsx_vst(zero, input, 48); 348cabdff1aSopenharmony_ci __lsx_vst(zero, input, 64); 349cabdff1aSopenharmony_ci __lsx_vst(zero, input, 80); 350cabdff1aSopenharmony_ci __lsx_vst(zero, input, 96); 351cabdff1aSopenharmony_ci __lsx_vst(zero, input, 112); 352cabdff1aSopenharmony_ci /* 1D idct8x8 */ 353cabdff1aSopenharmony_ci VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, 354cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 355cabdff1aSopenharmony_ci /* columns transform */ 356cabdff1aSopenharmony_ci LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, 357cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 358cabdff1aSopenharmony_ci /* 1D idct8x8 */ 359cabdff1aSopenharmony_ci VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, 360cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 361cabdff1aSopenharmony_ci /* final rounding (add 2^4, divide by 2^5) and shift */ 362cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, in0, 5, in1, 5, in2, 5, in3, 5, 363cabdff1aSopenharmony_ci in0, in1, in2, in3); 364cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, in4, 5, in5, 5, in6, 5, in7, 5, 365cabdff1aSopenharmony_ci in4, in5, in6, in7); 366cabdff1aSopenharmony_ci /* add block and store 8x8 */ 367cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); 368cabdff1aSopenharmony_ci dst += (4 * dst_stride); 369cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); 370cabdff1aSopenharmony_ci} 371cabdff1aSopenharmony_ci 372cabdff1aSopenharmony_cistatic void vp9_idct16_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst, 373cabdff1aSopenharmony_ci int32_t dst_stride) 374cabdff1aSopenharmony_ci{ 375cabdff1aSopenharmony_ci __m128i loc0, loc1, loc2, loc3; 376cabdff1aSopenharmony_ci __m128i reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; 377cabdff1aSopenharmony_ci __m128i reg1, reg3, reg5, reg7, reg9, reg11, reg13, reg15; 378cabdff1aSopenharmony_ci __m128i tmp5, tmp6, tmp7; 379cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 380cabdff1aSopenharmony_ci int32_t offset = dst_stride << 2; 381cabdff1aSopenharmony_ci 382cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 32*0, input, 32*1, input, 32*2, input, 32*3, 383cabdff1aSopenharmony_ci reg0, reg1, reg2, reg3); 384cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 32*4, input, 32*5, input, 32*6, input, 32*7, 385cabdff1aSopenharmony_ci reg4, reg5, reg6, reg7); 386cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 32*8, input, 32*9, input, 32*10, input, 32*11, 387cabdff1aSopenharmony_ci reg8, reg9, reg10, reg11); 388cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 32*12, input, 32*13, input, 32*14, input, 389cabdff1aSopenharmony_ci 32*15, reg12, reg13, reg14, reg15); 390cabdff1aSopenharmony_ci 391cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*0); 392cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*1); 393cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*2); 394cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*3); 395cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*4); 396cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*5); 397cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*6); 398cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*7); 399cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*8); 400cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*9); 401cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*10); 402cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*11); 403cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*12); 404cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*13); 405cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*14); 406cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*15); 407cabdff1aSopenharmony_ci 408cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); 409cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); 410cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); 411cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); 412cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); 413cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); 414cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); 415cabdff1aSopenharmony_ci 416cabdff1aSopenharmony_ci reg0 = __lsx_vsub_h(reg2, loc1); 417cabdff1aSopenharmony_ci reg2 = __lsx_vadd_h(reg2, loc1); 418cabdff1aSopenharmony_ci reg12 = __lsx_vsub_h(reg14, loc0); 419cabdff1aSopenharmony_ci reg14 = __lsx_vadd_h(reg14, loc0); 420cabdff1aSopenharmony_ci reg4 = __lsx_vsub_h(reg6, loc3); 421cabdff1aSopenharmony_ci reg6 = __lsx_vadd_h(reg6, loc3); 422cabdff1aSopenharmony_ci reg8 = __lsx_vsub_h(reg10, loc2); 423cabdff1aSopenharmony_ci reg10 = __lsx_vadd_h(reg10, loc2); 424cabdff1aSopenharmony_ci 425cabdff1aSopenharmony_ci /* stage2 */ 426cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); 427cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); 428cabdff1aSopenharmony_ci 429cabdff1aSopenharmony_ci reg9 = __lsx_vsub_h(reg1, loc2); 430cabdff1aSopenharmony_ci reg1 = __lsx_vadd_h(reg1, loc2); 431cabdff1aSopenharmony_ci reg7 = __lsx_vsub_h(reg15, loc3); 432cabdff1aSopenharmony_ci reg15 = __lsx_vadd_h(reg15, loc3); 433cabdff1aSopenharmony_ci 434cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); 435cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); 436cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); 437cabdff1aSopenharmony_ci 438cabdff1aSopenharmony_ci loc1 = __lsx_vadd_h(reg15, reg3); 439cabdff1aSopenharmony_ci reg3 = __lsx_vsub_h(reg15, reg3); 440cabdff1aSopenharmony_ci loc2 = __lsx_vadd_h(reg2, loc1); 441cabdff1aSopenharmony_ci reg15 = __lsx_vsub_h(reg2, loc1); 442cabdff1aSopenharmony_ci 443cabdff1aSopenharmony_ci loc1 = __lsx_vadd_h(reg1, reg13); 444cabdff1aSopenharmony_ci reg13 = __lsx_vsub_h(reg1, reg13); 445cabdff1aSopenharmony_ci loc0 = __lsx_vadd_h(reg0, loc1); 446cabdff1aSopenharmony_ci loc1 = __lsx_vsub_h(reg0, loc1); 447cabdff1aSopenharmony_ci tmp6 = loc0; 448cabdff1aSopenharmony_ci tmp7 = loc1; 449cabdff1aSopenharmony_ci reg0 = loc2; 450cabdff1aSopenharmony_ci 451cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); 452cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(__lsx_vneg_h(reg5), __lsx_vneg_h(reg11), cospi_8_64, 453cabdff1aSopenharmony_ci cospi_24_64, reg5, reg11); 454cabdff1aSopenharmony_ci 455cabdff1aSopenharmony_ci loc0 = __lsx_vadd_h(reg9, reg5); 456cabdff1aSopenharmony_ci reg5 = __lsx_vsub_h(reg9, reg5); 457cabdff1aSopenharmony_ci reg2 = __lsx_vadd_h(reg6, loc0); 458cabdff1aSopenharmony_ci reg1 = __lsx_vsub_h(reg6, loc0); 459cabdff1aSopenharmony_ci 460cabdff1aSopenharmony_ci loc0 = __lsx_vadd_h(reg7, reg11); 461cabdff1aSopenharmony_ci reg11 = __lsx_vsub_h(reg7, reg11); 462cabdff1aSopenharmony_ci loc1 = __lsx_vadd_h(reg4, loc0); 463cabdff1aSopenharmony_ci loc2 = __lsx_vsub_h(reg4, loc0); 464cabdff1aSopenharmony_ci tmp5 = loc1; 465cabdff1aSopenharmony_ci 466cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); 467cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); 468cabdff1aSopenharmony_ci 469cabdff1aSopenharmony_ci reg10 = loc0; 470cabdff1aSopenharmony_ci reg11 = loc1; 471cabdff1aSopenharmony_ci 472cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); 473cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); 474cabdff1aSopenharmony_ci reg13 = loc2; 475cabdff1aSopenharmony_ci 476cabdff1aSopenharmony_ci /* Transpose and store the output */ 477cabdff1aSopenharmony_ci reg12 = tmp5; 478cabdff1aSopenharmony_ci reg14 = tmp6; 479cabdff1aSopenharmony_ci reg3 = tmp7; 480cabdff1aSopenharmony_ci 481cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, reg0, 6, reg2, 6, reg4, 6, reg6, 6, 482cabdff1aSopenharmony_ci reg0, reg2, reg4, reg6); 483cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6); 484cabdff1aSopenharmony_ci dst += offset; 485cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, reg8, 6, reg10, 6, reg12, 6, reg14, 6, 486cabdff1aSopenharmony_ci reg8, reg10, reg12, reg14); 487cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); 488cabdff1aSopenharmony_ci dst += offset; 489cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, reg3, 6, reg5, 6, reg11, 6, reg13, 6, 490cabdff1aSopenharmony_ci reg3, reg5, reg11, reg13); 491cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); 492cabdff1aSopenharmony_ci dst += offset; 493cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, reg1, 6, reg7, 6, reg9, 6, reg15, 6, 494cabdff1aSopenharmony_ci reg1, reg7, reg9, reg15); 495cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); 496cabdff1aSopenharmony_ci} 497cabdff1aSopenharmony_ci 498cabdff1aSopenharmony_cistatic void vp9_idct16_1d_columns_lsx(int16_t *input, int16_t *output) 499cabdff1aSopenharmony_ci{ 500cabdff1aSopenharmony_ci __m128i loc0, loc1, loc2, loc3; 501cabdff1aSopenharmony_ci __m128i reg1, reg3, reg5, reg7, reg9, reg11, reg13, reg15; 502cabdff1aSopenharmony_ci __m128i reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; 503cabdff1aSopenharmony_ci __m128i tmp5, tmp6, tmp7; 504cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 505cabdff1aSopenharmony_ci int16_t *offset; 506cabdff1aSopenharmony_ci 507cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 32*0, input, 32*1, input, 32*2, input, 32*3, 508cabdff1aSopenharmony_ci reg0, reg1, reg2, reg3); 509cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 32*4, input, 32*5, input, 32*6, input, 32*7, 510cabdff1aSopenharmony_ci reg4, reg5, reg6, reg7); 511cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 32*8, input, 32*9, input, 32*10, input, 32*11, 512cabdff1aSopenharmony_ci reg8, reg9, reg10, reg11); 513cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, input, 32*12, input, 32*13, input, 32*14, input, 514cabdff1aSopenharmony_ci 32*15, reg12, reg13, reg14, reg15); 515cabdff1aSopenharmony_ci 516cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*0); 517cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*1); 518cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*2); 519cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*3); 520cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*4); 521cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*5); 522cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*6); 523cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*7); 524cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*8); 525cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*9); 526cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*10); 527cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*11); 528cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*12); 529cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*13); 530cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*14); 531cabdff1aSopenharmony_ci __lsx_vst(zero, input, 32*15); 532cabdff1aSopenharmony_ci 533cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); 534cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); 535cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); 536cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); 537cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); 538cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); 539cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); 540cabdff1aSopenharmony_ci 541cabdff1aSopenharmony_ci reg0 = __lsx_vsub_h(reg2, loc1); 542cabdff1aSopenharmony_ci reg2 = __lsx_vadd_h(reg2, loc1); 543cabdff1aSopenharmony_ci reg12 = __lsx_vsub_h(reg14, loc0); 544cabdff1aSopenharmony_ci reg14 = __lsx_vadd_h(reg14, loc0); 545cabdff1aSopenharmony_ci reg4 = __lsx_vsub_h(reg6, loc3); 546cabdff1aSopenharmony_ci reg6 = __lsx_vadd_h(reg6, loc3); 547cabdff1aSopenharmony_ci reg8 = __lsx_vsub_h(reg10, loc2); 548cabdff1aSopenharmony_ci reg10 = __lsx_vadd_h(reg10, loc2); 549cabdff1aSopenharmony_ci 550cabdff1aSopenharmony_ci /* stage2 */ 551cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); 552cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); 553cabdff1aSopenharmony_ci 554cabdff1aSopenharmony_ci reg9 = __lsx_vsub_h(reg1, loc2); 555cabdff1aSopenharmony_ci reg1 = __lsx_vadd_h(reg1, loc2); 556cabdff1aSopenharmony_ci reg7 = __lsx_vsub_h(reg15, loc3); 557cabdff1aSopenharmony_ci reg15 = __lsx_vadd_h(reg15, loc3); 558cabdff1aSopenharmony_ci 559cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); 560cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); 561cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); 562cabdff1aSopenharmony_ci 563cabdff1aSopenharmony_ci loc1 = __lsx_vadd_h(reg15, reg3); 564cabdff1aSopenharmony_ci reg3 = __lsx_vsub_h(reg15, reg3); 565cabdff1aSopenharmony_ci loc2 = __lsx_vadd_h(reg2, loc1); 566cabdff1aSopenharmony_ci reg15 = __lsx_vsub_h(reg2, loc1); 567cabdff1aSopenharmony_ci 568cabdff1aSopenharmony_ci loc1 = __lsx_vadd_h(reg1, reg13); 569cabdff1aSopenharmony_ci reg13 = __lsx_vsub_h(reg1, reg13); 570cabdff1aSopenharmony_ci loc0 = __lsx_vadd_h(reg0, loc1); 571cabdff1aSopenharmony_ci loc1 = __lsx_vsub_h(reg0, loc1); 572cabdff1aSopenharmony_ci tmp6 = loc0; 573cabdff1aSopenharmony_ci tmp7 = loc1; 574cabdff1aSopenharmony_ci reg0 = loc2; 575cabdff1aSopenharmony_ci 576cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); 577cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(__lsx_vneg_h(reg5), __lsx_vneg_h(reg11), cospi_8_64, 578cabdff1aSopenharmony_ci cospi_24_64, reg5, reg11); 579cabdff1aSopenharmony_ci 580cabdff1aSopenharmony_ci loc0 = __lsx_vadd_h(reg9, reg5); 581cabdff1aSopenharmony_ci reg5 = __lsx_vsub_h(reg9, reg5); 582cabdff1aSopenharmony_ci reg2 = __lsx_vadd_h(reg6, loc0); 583cabdff1aSopenharmony_ci reg1 = __lsx_vsub_h(reg6, loc0); 584cabdff1aSopenharmony_ci 585cabdff1aSopenharmony_ci loc0 = __lsx_vadd_h(reg7, reg11); 586cabdff1aSopenharmony_ci reg11 = __lsx_vsub_h(reg7, reg11); 587cabdff1aSopenharmony_ci loc1 = __lsx_vadd_h(reg4, loc0); 588cabdff1aSopenharmony_ci loc2 = __lsx_vsub_h(reg4, loc0); 589cabdff1aSopenharmony_ci 590cabdff1aSopenharmony_ci tmp5 = loc1; 591cabdff1aSopenharmony_ci 592cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); 593cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); 594cabdff1aSopenharmony_ci 595cabdff1aSopenharmony_ci reg10 = loc0; 596cabdff1aSopenharmony_ci reg11 = loc1; 597cabdff1aSopenharmony_ci 598cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); 599cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); 600cabdff1aSopenharmony_ci reg13 = loc2; 601cabdff1aSopenharmony_ci 602cabdff1aSopenharmony_ci /* Transpose and store the output */ 603cabdff1aSopenharmony_ci reg12 = tmp5; 604cabdff1aSopenharmony_ci reg14 = tmp6; 605cabdff1aSopenharmony_ci reg3 = tmp7; 606cabdff1aSopenharmony_ci 607cabdff1aSopenharmony_ci /* transpose block */ 608cabdff1aSopenharmony_ci LSX_TRANSPOSE8x8_H(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, 609cabdff1aSopenharmony_ci reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); 610cabdff1aSopenharmony_ci 611cabdff1aSopenharmony_ci __lsx_vst(reg0, output, 32*0); 612cabdff1aSopenharmony_ci __lsx_vst(reg2, output, 32*1); 613cabdff1aSopenharmony_ci __lsx_vst(reg4, output, 32*2); 614cabdff1aSopenharmony_ci __lsx_vst(reg6, output, 32*3); 615cabdff1aSopenharmony_ci __lsx_vst(reg8, output, 32*4); 616cabdff1aSopenharmony_ci __lsx_vst(reg10, output, 32*5); 617cabdff1aSopenharmony_ci __lsx_vst(reg12, output, 32*6); 618cabdff1aSopenharmony_ci __lsx_vst(reg14, output, 32*7); 619cabdff1aSopenharmony_ci 620cabdff1aSopenharmony_ci /* transpose block */ 621cabdff1aSopenharmony_ci LSX_TRANSPOSE8x8_H(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, 622cabdff1aSopenharmony_ci reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); 623cabdff1aSopenharmony_ci 624cabdff1aSopenharmony_ci offset = output + 8; 625cabdff1aSopenharmony_ci __lsx_vst(reg3, offset, 32*0); 626cabdff1aSopenharmony_ci __lsx_vst(reg13, offset, 32*1); 627cabdff1aSopenharmony_ci __lsx_vst(reg11, offset, 32*2); 628cabdff1aSopenharmony_ci __lsx_vst(reg5, offset, 32*3); 629cabdff1aSopenharmony_ci 630cabdff1aSopenharmony_ci offset = output + 8 + 4 * 16; 631cabdff1aSopenharmony_ci __lsx_vst(reg7, offset, 32*0); 632cabdff1aSopenharmony_ci __lsx_vst(reg9, offset, 32*1); 633cabdff1aSopenharmony_ci __lsx_vst(reg1, offset, 32*2); 634cabdff1aSopenharmony_ci __lsx_vst(reg15, offset, 32*3); 635cabdff1aSopenharmony_ci} 636cabdff1aSopenharmony_ci 637cabdff1aSopenharmony_cistatic void vp9_idct16x16_1_add_lsx(int16_t *input, uint8_t *dst, 638cabdff1aSopenharmony_ci int32_t dst_stride) 639cabdff1aSopenharmony_ci{ 640cabdff1aSopenharmony_ci uint8_t i; 641cabdff1aSopenharmony_ci int16_t out; 642cabdff1aSopenharmony_ci __m128i vec, res0, res1, res2, res3, res4, res5, res6, res7; 643cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; 644cabdff1aSopenharmony_ci int32_t stride2 = dst_stride << 1; 645cabdff1aSopenharmony_ci int32_t stride3 = stride2 + dst_stride; 646cabdff1aSopenharmony_ci int32_t stride4 = stride2 << 1; 647cabdff1aSopenharmony_ci 648cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); 649cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); 650cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO(out, 6); 651cabdff1aSopenharmony_ci input[0] = 0; 652cabdff1aSopenharmony_ci vec = __lsx_vreplgr2vr_h(out); 653cabdff1aSopenharmony_ci 654cabdff1aSopenharmony_ci for (i = 4; i--;) { 655cabdff1aSopenharmony_ci dst0 = __lsx_vld(dst, 0); 656cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, stride2, dst1, dst2); 657cabdff1aSopenharmony_ci dst3 = __lsx_vldx(dst, stride3); 658cabdff1aSopenharmony_ci VP9_UNPCK_UB_SH(dst0, res4, res0); 659cabdff1aSopenharmony_ci VP9_UNPCK_UB_SH(dst1, res5, res1); 660cabdff1aSopenharmony_ci VP9_UNPCK_UB_SH(dst2, res6, res2); 661cabdff1aSopenharmony_ci VP9_UNPCK_UB_SH(dst3, res7, res3); 662cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, 663cabdff1aSopenharmony_ci res0, res1, res2, res3); 664cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, 665cabdff1aSopenharmony_ci res4, res5, res6, res7); 666cabdff1aSopenharmony_ci DUP4_ARG1(__lsx_vclip255_h, res0, res1, res2, res3, 667cabdff1aSopenharmony_ci res0, res1, res2, res3); 668cabdff1aSopenharmony_ci DUP4_ARG1(__lsx_vclip255_h, res4, res5, res6, res7, 669cabdff1aSopenharmony_ci res4, res5, res6, res7); 670cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpickev_b, res4, res0, res5, res1, res6, 671cabdff1aSopenharmony_ci res2, res7, res3, tmp0, tmp1, tmp2, tmp3); 672cabdff1aSopenharmony_ci __lsx_vst(tmp0, dst, 0); 673cabdff1aSopenharmony_ci __lsx_vstx(tmp1, dst, dst_stride); 674cabdff1aSopenharmony_ci __lsx_vstx(tmp2, dst, stride2); 675cabdff1aSopenharmony_ci __lsx_vstx(tmp3, dst, stride3); 676cabdff1aSopenharmony_ci dst += stride4; 677cabdff1aSopenharmony_ci } 678cabdff1aSopenharmony_ci} 679cabdff1aSopenharmony_ci 680cabdff1aSopenharmony_cistatic void vp9_idct16x16_10_colcol_addblk_lsx(int16_t *input, uint8_t *dst, 681cabdff1aSopenharmony_ci int32_t dst_stride) 682cabdff1aSopenharmony_ci{ 683cabdff1aSopenharmony_ci int32_t i; 684cabdff1aSopenharmony_ci int16_t out_arr[16 * 16] ALLOC_ALIGNED(16); 685cabdff1aSopenharmony_ci int16_t *out = out_arr; 686cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 687cabdff1aSopenharmony_ci 688cabdff1aSopenharmony_ci /* transform rows */ 689cabdff1aSopenharmony_ci vp9_idct16_1d_columns_lsx(input, out); 690cabdff1aSopenharmony_ci 691cabdff1aSopenharmony_ci /* short case just considers top 4 rows as valid output */ 692cabdff1aSopenharmony_ci out += 4 * 16; 693cabdff1aSopenharmony_ci for (i = 3; i--;) { 694cabdff1aSopenharmony_ci __lsx_vst(zero, out, 0); 695cabdff1aSopenharmony_ci __lsx_vst(zero, out, 16); 696cabdff1aSopenharmony_ci __lsx_vst(zero, out, 32); 697cabdff1aSopenharmony_ci __lsx_vst(zero, out, 48); 698cabdff1aSopenharmony_ci __lsx_vst(zero, out, 64); 699cabdff1aSopenharmony_ci __lsx_vst(zero, out, 80); 700cabdff1aSopenharmony_ci __lsx_vst(zero, out, 96); 701cabdff1aSopenharmony_ci __lsx_vst(zero, out, 112); 702cabdff1aSopenharmony_ci out += 64; 703cabdff1aSopenharmony_ci } 704cabdff1aSopenharmony_ci 705cabdff1aSopenharmony_ci out = out_arr; 706cabdff1aSopenharmony_ci 707cabdff1aSopenharmony_ci /* transform columns */ 708cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 709cabdff1aSopenharmony_ci /* process 8 * 16 block */ 710cabdff1aSopenharmony_ci vp9_idct16_1d_columns_addblk_lsx((out + (i << 3)), (dst + (i << 3)), 711cabdff1aSopenharmony_ci dst_stride); 712cabdff1aSopenharmony_ci } 713cabdff1aSopenharmony_ci} 714cabdff1aSopenharmony_ci 715cabdff1aSopenharmony_cistatic void vp9_idct16x16_colcol_addblk_lsx(int16_t *input, uint8_t *dst, 716cabdff1aSopenharmony_ci int32_t dst_stride) 717cabdff1aSopenharmony_ci{ 718cabdff1aSopenharmony_ci int32_t i; 719cabdff1aSopenharmony_ci int16_t out_arr[16 * 16] ALLOC_ALIGNED(16); 720cabdff1aSopenharmony_ci int16_t *out = out_arr; 721cabdff1aSopenharmony_ci 722cabdff1aSopenharmony_ci /* transform rows */ 723cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 724cabdff1aSopenharmony_ci /* process 8 * 16 block */ 725cabdff1aSopenharmony_ci vp9_idct16_1d_columns_lsx((input + (i << 3)), (out + (i << 7))); 726cabdff1aSopenharmony_ci } 727cabdff1aSopenharmony_ci 728cabdff1aSopenharmony_ci /* transform columns */ 729cabdff1aSopenharmony_ci for (i = 0; i < 2; i++) { 730cabdff1aSopenharmony_ci /* process 8 * 16 block */ 731cabdff1aSopenharmony_ci vp9_idct16_1d_columns_addblk_lsx((out + (i << 3)), (dst + (i << 3)), 732cabdff1aSopenharmony_ci dst_stride); 733cabdff1aSopenharmony_ci } 734cabdff1aSopenharmony_ci} 735cabdff1aSopenharmony_ci 736cabdff1aSopenharmony_cistatic void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf, 737cabdff1aSopenharmony_ci int16_t *tmp_eve_buf, 738cabdff1aSopenharmony_ci int16_t *tmp_odd_buf, 739cabdff1aSopenharmony_ci int16_t *dst) 740cabdff1aSopenharmony_ci{ 741cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 742cabdff1aSopenharmony_ci __m128i m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; 743cabdff1aSopenharmony_ci 744cabdff1aSopenharmony_ci /* FINAL BUTTERFLY : Dependency on Even & Odd */ 745cabdff1aSopenharmony_ci vec0 = __lsx_vld(tmp_odd_buf, 0); 746cabdff1aSopenharmony_ci vec1 = __lsx_vld(tmp_odd_buf, 9 * 16); 747cabdff1aSopenharmony_ci vec2 = __lsx_vld(tmp_odd_buf, 14 * 16); 748cabdff1aSopenharmony_ci vec3 = __lsx_vld(tmp_odd_buf, 6 * 16); 749cabdff1aSopenharmony_ci loc0 = __lsx_vld(tmp_eve_buf, 0); 750cabdff1aSopenharmony_ci loc1 = __lsx_vld(tmp_eve_buf, 8 * 16); 751cabdff1aSopenharmony_ci loc2 = __lsx_vld(tmp_eve_buf, 4 * 16); 752cabdff1aSopenharmony_ci loc3 = __lsx_vld(tmp_eve_buf, 12 * 16); 753cabdff1aSopenharmony_ci 754cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h,loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 755cabdff1aSopenharmony_ci m0, m4, m2, m6); 756cabdff1aSopenharmony_ci 757cabdff1aSopenharmony_ci #define SUB(a, b) __lsx_vsub_h(a, b) 758cabdff1aSopenharmony_ci 759cabdff1aSopenharmony_ci __lsx_vst(SUB(loc0, vec3), tmp_buf, 31 * 16); 760cabdff1aSopenharmony_ci __lsx_vst(SUB(loc1, vec2), tmp_buf, 23 * 16); 761cabdff1aSopenharmony_ci __lsx_vst(SUB(loc2, vec1), tmp_buf, 27 * 16); 762cabdff1aSopenharmony_ci __lsx_vst(SUB(loc3, vec0), tmp_buf, 19 * 16); 763cabdff1aSopenharmony_ci 764cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 765cabdff1aSopenharmony_ci vec0 = __lsx_vld(tmp_odd_buf, 4 * 16); 766cabdff1aSopenharmony_ci vec1 = __lsx_vld(tmp_odd_buf, 13 * 16); 767cabdff1aSopenharmony_ci vec2 = __lsx_vld(tmp_odd_buf, 10 * 16); 768cabdff1aSopenharmony_ci vec3 = __lsx_vld(tmp_odd_buf, 3 * 16); 769cabdff1aSopenharmony_ci loc0 = __lsx_vld(tmp_eve_buf, 2 * 16); 770cabdff1aSopenharmony_ci loc1 = __lsx_vld(tmp_eve_buf, 10 * 16); 771cabdff1aSopenharmony_ci loc2 = __lsx_vld(tmp_eve_buf, 6 * 16); 772cabdff1aSopenharmony_ci loc3 = __lsx_vld(tmp_eve_buf, 14 * 16); 773cabdff1aSopenharmony_ci 774cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 775cabdff1aSopenharmony_ci m1, m5, m3, m7); 776cabdff1aSopenharmony_ci 777cabdff1aSopenharmony_ci __lsx_vst(SUB(loc0, vec3), tmp_buf, 29 * 16); 778cabdff1aSopenharmony_ci __lsx_vst(SUB(loc1, vec2), tmp_buf, 21 * 16); 779cabdff1aSopenharmony_ci __lsx_vst(SUB(loc2, vec1), tmp_buf, 25 * 16); 780cabdff1aSopenharmony_ci __lsx_vst(SUB(loc3, vec0), tmp_buf, 17 * 16); 781cabdff1aSopenharmony_ci 782cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 783cabdff1aSopenharmony_ci vec0 = __lsx_vld(tmp_odd_buf, 2 * 16); 784cabdff1aSopenharmony_ci vec1 = __lsx_vld(tmp_odd_buf, 11 * 16); 785cabdff1aSopenharmony_ci vec2 = __lsx_vld(tmp_odd_buf, 12 * 16); 786cabdff1aSopenharmony_ci vec3 = __lsx_vld(tmp_odd_buf, 7 * 16); 787cabdff1aSopenharmony_ci loc0 = __lsx_vld(tmp_eve_buf, 1 * 16); 788cabdff1aSopenharmony_ci loc1 = __lsx_vld(tmp_eve_buf, 9 * 16); 789cabdff1aSopenharmony_ci loc2 = __lsx_vld(tmp_eve_buf, 5 * 16); 790cabdff1aSopenharmony_ci loc3 = __lsx_vld(tmp_eve_buf, 13 * 16); 791cabdff1aSopenharmony_ci 792cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 793cabdff1aSopenharmony_ci n0, n4, n2, n6); 794cabdff1aSopenharmony_ci 795cabdff1aSopenharmony_ci __lsx_vst(SUB(loc0, vec3), tmp_buf, 30 * 16); 796cabdff1aSopenharmony_ci __lsx_vst(SUB(loc1, vec2), tmp_buf, 22 * 16); 797cabdff1aSopenharmony_ci __lsx_vst(SUB(loc2, vec1), tmp_buf, 26 * 16); 798cabdff1aSopenharmony_ci __lsx_vst(SUB(loc3, vec0), tmp_buf, 18 * 16); 799cabdff1aSopenharmony_ci 800cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 801cabdff1aSopenharmony_ci vec0 = __lsx_vld(tmp_odd_buf, 5 * 16); 802cabdff1aSopenharmony_ci vec1 = __lsx_vld(tmp_odd_buf, 15 * 16); 803cabdff1aSopenharmony_ci vec2 = __lsx_vld(tmp_odd_buf, 8 * 16); 804cabdff1aSopenharmony_ci vec3 = __lsx_vld(tmp_odd_buf, 1 * 16); 805cabdff1aSopenharmony_ci loc0 = __lsx_vld(tmp_eve_buf, 3 * 16); 806cabdff1aSopenharmony_ci loc1 = __lsx_vld(tmp_eve_buf, 11 * 16); 807cabdff1aSopenharmony_ci loc2 = __lsx_vld(tmp_eve_buf, 7 * 16); 808cabdff1aSopenharmony_ci loc3 = __lsx_vld(tmp_eve_buf, 15 * 16); 809cabdff1aSopenharmony_ci 810cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 811cabdff1aSopenharmony_ci n1, n5, n3, n7); 812cabdff1aSopenharmony_ci 813cabdff1aSopenharmony_ci __lsx_vst(SUB(loc0, vec3), tmp_buf, 28 * 16); 814cabdff1aSopenharmony_ci __lsx_vst(SUB(loc1, vec2), tmp_buf, 20 * 16); 815cabdff1aSopenharmony_ci __lsx_vst(SUB(loc2, vec1), tmp_buf, 24 * 16); 816cabdff1aSopenharmony_ci __lsx_vst(SUB(loc3, vec0), tmp_buf, 16 * 16); 817cabdff1aSopenharmony_ci 818cabdff1aSopenharmony_ci /* Transpose : 16 vectors */ 819cabdff1aSopenharmony_ci /* 1st & 2nd 8x8 */ 820cabdff1aSopenharmony_ci LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, 821cabdff1aSopenharmony_ci m0, n0, m1, n1, m2, n2, m3, n3); 822cabdff1aSopenharmony_ci __lsx_vst(m0, dst, 0); 823cabdff1aSopenharmony_ci __lsx_vst(n0, dst, 32 * 2); 824cabdff1aSopenharmony_ci __lsx_vst(m1, dst, 32 * 4); 825cabdff1aSopenharmony_ci __lsx_vst(n1, dst, 32 * 6); 826cabdff1aSopenharmony_ci __lsx_vst(m2, dst, 32 * 8); 827cabdff1aSopenharmony_ci __lsx_vst(n2, dst, 32 * 10); 828cabdff1aSopenharmony_ci __lsx_vst(m3, dst, 32 * 12); 829cabdff1aSopenharmony_ci __lsx_vst(n3, dst, 32 * 14); 830cabdff1aSopenharmony_ci 831cabdff1aSopenharmony_ci LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, 832cabdff1aSopenharmony_ci m4, n4, m5, n5, m6, n6, m7, n7); 833cabdff1aSopenharmony_ci 834cabdff1aSopenharmony_ci __lsx_vst(m4, dst, 16); 835cabdff1aSopenharmony_ci __lsx_vst(n4, dst, 16 + 32 * 2); 836cabdff1aSopenharmony_ci __lsx_vst(m5, dst, 16 + 32 * 4); 837cabdff1aSopenharmony_ci __lsx_vst(n5, dst, 16 + 32 * 6); 838cabdff1aSopenharmony_ci __lsx_vst(m6, dst, 16 + 32 * 8); 839cabdff1aSopenharmony_ci __lsx_vst(n6, dst, 16 + 32 * 10); 840cabdff1aSopenharmony_ci __lsx_vst(m7, dst, 16 + 32 * 12); 841cabdff1aSopenharmony_ci __lsx_vst(n7, dst, 16 + 32 * 14); 842cabdff1aSopenharmony_ci 843cabdff1aSopenharmony_ci /* 3rd & 4th 8x8 */ 844cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 16, tmp_buf, 16 * 17, 845cabdff1aSopenharmony_ci tmp_buf, 16 * 18, tmp_buf, 16 * 19, m0, n0, m1, n1); 846cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 20, tmp_buf, 16 * 21, 847cabdff1aSopenharmony_ci tmp_buf, 16 * 22, tmp_buf, 16 * 23, m2, n2, m3, n3); 848cabdff1aSopenharmony_ci 849cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 24, tmp_buf, 16 * 25, 850cabdff1aSopenharmony_ci tmp_buf, 16 * 26, tmp_buf, 16 * 27, m4, n4, m5, n5); 851cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 28, tmp_buf, 16 * 29, 852cabdff1aSopenharmony_ci tmp_buf, 16 * 30, tmp_buf, 16 * 31, m6, n6, m7, n7); 853cabdff1aSopenharmony_ci 854cabdff1aSopenharmony_ci LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, 855cabdff1aSopenharmony_ci m0, n0, m1, n1, m2, n2, m3, n3); 856cabdff1aSopenharmony_ci 857cabdff1aSopenharmony_ci __lsx_vst(m0, dst, 32); 858cabdff1aSopenharmony_ci __lsx_vst(n0, dst, 32 + 32 * 2); 859cabdff1aSopenharmony_ci __lsx_vst(m1, dst, 32 + 32 * 4); 860cabdff1aSopenharmony_ci __lsx_vst(n1, dst, 32 + 32 * 6); 861cabdff1aSopenharmony_ci __lsx_vst(m2, dst, 32 + 32 * 8); 862cabdff1aSopenharmony_ci __lsx_vst(n2, dst, 32 + 32 * 10); 863cabdff1aSopenharmony_ci __lsx_vst(m3, dst, 32 + 32 * 12); 864cabdff1aSopenharmony_ci __lsx_vst(n3, dst, 32 + 32 * 14); 865cabdff1aSopenharmony_ci 866cabdff1aSopenharmony_ci LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, 867cabdff1aSopenharmony_ci m4, n4, m5, n5, m6, n6, m7, n7); 868cabdff1aSopenharmony_ci 869cabdff1aSopenharmony_ci __lsx_vst(m4, dst, 48); 870cabdff1aSopenharmony_ci __lsx_vst(n4, dst, 48 + 32 * 2); 871cabdff1aSopenharmony_ci __lsx_vst(m5, dst, 48 + 32 * 4); 872cabdff1aSopenharmony_ci __lsx_vst(n5, dst, 48 + 32 * 6); 873cabdff1aSopenharmony_ci __lsx_vst(m6, dst, 48 + 32 * 8); 874cabdff1aSopenharmony_ci __lsx_vst(n6, dst, 48 + 32 * 10); 875cabdff1aSopenharmony_ci __lsx_vst(m7, dst, 48 + 32 * 12); 876cabdff1aSopenharmony_ci __lsx_vst(n7, dst, 48 + 32 * 14); 877cabdff1aSopenharmony_ci} 878cabdff1aSopenharmony_ci 879cabdff1aSopenharmony_cistatic void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf, 880cabdff1aSopenharmony_ci int16_t *tmp_eve_buf) 881cabdff1aSopenharmony_ci{ 882cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 883cabdff1aSopenharmony_ci __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 884cabdff1aSopenharmony_ci __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; 885cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 886cabdff1aSopenharmony_ci 887cabdff1aSopenharmony_ci /* Even stage 1 */ 888cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 32 * 8, 889cabdff1aSopenharmony_ci tmp_buf, 32 * 16, tmp_buf, 32 * 24, reg0, reg1, reg2, reg3); 890cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_buf, 32 * 32, tmp_buf, 32 * 40, 891cabdff1aSopenharmony_ci tmp_buf, 32 * 48, tmp_buf, 32 * 56, reg4, reg5, reg6, reg7); 892cabdff1aSopenharmony_ci 893cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 0); 894cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 8); 895cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 16); 896cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 24); 897cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 32); 898cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 40); 899cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 48); 900cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 56); 901cabdff1aSopenharmony_ci 902cabdff1aSopenharmony_ci tmp_buf += (2 * 32); 903cabdff1aSopenharmony_ci 904cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); 905cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); 906cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); 907cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 908cabdff1aSopenharmony_ci 909cabdff1aSopenharmony_ci loc1 = vec3; 910cabdff1aSopenharmony_ci loc0 = vec1; 911cabdff1aSopenharmony_ci 912cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); 913cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); 914cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); 915cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); 916cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); 917cabdff1aSopenharmony_ci 918cabdff1aSopenharmony_ci /* Even stage 2 */ 919cabdff1aSopenharmony_ci /* Load 8 */ 920cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 32 * 8, 921cabdff1aSopenharmony_ci tmp_buf, 32 * 16, tmp_buf, 32 * 24, reg0, reg1, reg2, reg3); 922cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_buf, 32 * 32, tmp_buf, 32 * 40, 923cabdff1aSopenharmony_ci tmp_buf, 32 * 48, tmp_buf, 32 * 56, reg4, reg5, reg6, reg7); 924cabdff1aSopenharmony_ci 925cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 0); 926cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 8); 927cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 16); 928cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 24); 929cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 32); 930cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 40); 931cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 48); 932cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 32 * 56); 933cabdff1aSopenharmony_ci 934cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); 935cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); 936cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); 937cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); 938cabdff1aSopenharmony_ci 939cabdff1aSopenharmony_ci vec0 = __lsx_vadd_h(reg0, reg4); 940cabdff1aSopenharmony_ci reg0 = __lsx_vsub_h(reg0, reg4); 941cabdff1aSopenharmony_ci reg4 = __lsx_vadd_h(reg6, reg2); 942cabdff1aSopenharmony_ci reg6 = __lsx_vsub_h(reg6, reg2); 943cabdff1aSopenharmony_ci reg2 = __lsx_vadd_h(reg1, reg5); 944cabdff1aSopenharmony_ci reg1 = __lsx_vsub_h(reg1, reg5); 945cabdff1aSopenharmony_ci reg5 = __lsx_vadd_h(reg7, reg3); 946cabdff1aSopenharmony_ci reg7 = __lsx_vsub_h(reg7, reg3); 947cabdff1aSopenharmony_ci reg3 = vec0; 948cabdff1aSopenharmony_ci 949cabdff1aSopenharmony_ci vec1 = reg2; 950cabdff1aSopenharmony_ci reg2 = __lsx_vadd_h(reg3, reg4); 951cabdff1aSopenharmony_ci reg3 = __lsx_vsub_h(reg3, reg4); 952cabdff1aSopenharmony_ci reg4 = __lsx_vsub_h(reg5, vec1); 953cabdff1aSopenharmony_ci reg5 = __lsx_vadd_h(reg5, vec1); 954cabdff1aSopenharmony_ci 955cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); 956cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(__lsx_vneg_h(reg6), reg1, cospi_24_64, cospi_8_64, 957cabdff1aSopenharmony_ci reg6, reg1); 958cabdff1aSopenharmony_ci 959cabdff1aSopenharmony_ci vec0 = __lsx_vsub_h(reg0, reg6); 960cabdff1aSopenharmony_ci reg0 = __lsx_vadd_h(reg0, reg6); 961cabdff1aSopenharmony_ci vec1 = __lsx_vsub_h(reg7, reg1); 962cabdff1aSopenharmony_ci reg7 = __lsx_vadd_h(reg7, reg1); 963cabdff1aSopenharmony_ci 964cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); 965cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); 966cabdff1aSopenharmony_ci 967cabdff1aSopenharmony_ci /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ 968cabdff1aSopenharmony_ci /* Store 8 */ 969cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); 970cabdff1aSopenharmony_ci __lsx_vst(loc1, tmp_eve_buf, 0); 971cabdff1aSopenharmony_ci __lsx_vst(loc3, tmp_eve_buf, 16); 972cabdff1aSopenharmony_ci __lsx_vst(loc2, tmp_eve_buf, 14 * 16); 973cabdff1aSopenharmony_ci __lsx_vst(loc0, tmp_eve_buf, 14 * 16 + 16); 974cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); 975cabdff1aSopenharmony_ci __lsx_vst(loc1, tmp_eve_buf, 2 * 16); 976cabdff1aSopenharmony_ci __lsx_vst(loc3, tmp_eve_buf, 2 * 16 + 16); 977cabdff1aSopenharmony_ci __lsx_vst(loc2, tmp_eve_buf, 12 * 16); 978cabdff1aSopenharmony_ci __lsx_vst(loc0, tmp_eve_buf, 12 * 16 + 16); 979cabdff1aSopenharmony_ci 980cabdff1aSopenharmony_ci /* Store 8 */ 981cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); 982cabdff1aSopenharmony_ci __lsx_vst(loc1, tmp_eve_buf, 4 * 16); 983cabdff1aSopenharmony_ci __lsx_vst(loc3, tmp_eve_buf, 4 * 16 + 16); 984cabdff1aSopenharmony_ci __lsx_vst(loc2, tmp_eve_buf, 10 * 16); 985cabdff1aSopenharmony_ci __lsx_vst(loc0, tmp_eve_buf, 10 * 16 + 16); 986cabdff1aSopenharmony_ci 987cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); 988cabdff1aSopenharmony_ci __lsx_vst(loc1, tmp_eve_buf, 6 * 16); 989cabdff1aSopenharmony_ci __lsx_vst(loc3, tmp_eve_buf, 6 * 16 + 16); 990cabdff1aSopenharmony_ci __lsx_vst(loc2, tmp_eve_buf, 8 * 16); 991cabdff1aSopenharmony_ci __lsx_vst(loc0, tmp_eve_buf, 8 * 16 + 16); 992cabdff1aSopenharmony_ci} 993cabdff1aSopenharmony_ci 994cabdff1aSopenharmony_cistatic void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf, 995cabdff1aSopenharmony_ci int16_t *tmp_odd_buf) 996cabdff1aSopenharmony_ci{ 997cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 998cabdff1aSopenharmony_ci __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 999cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 1000cabdff1aSopenharmony_ci 1001cabdff1aSopenharmony_ci /* Odd stage 1 */ 1002cabdff1aSopenharmony_ci reg0 = __lsx_vld(tmp_buf, 64); 1003cabdff1aSopenharmony_ci reg1 = __lsx_vld(tmp_buf, 7 * 64); 1004cabdff1aSopenharmony_ci reg2 = __lsx_vld(tmp_buf, 9 * 64); 1005cabdff1aSopenharmony_ci reg3 = __lsx_vld(tmp_buf, 15 * 64); 1006cabdff1aSopenharmony_ci reg4 = __lsx_vld(tmp_buf, 17 * 64); 1007cabdff1aSopenharmony_ci reg5 = __lsx_vld(tmp_buf, 23 * 64); 1008cabdff1aSopenharmony_ci reg6 = __lsx_vld(tmp_buf, 25 * 64); 1009cabdff1aSopenharmony_ci reg7 = __lsx_vld(tmp_buf, 31 * 64); 1010cabdff1aSopenharmony_ci 1011cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 64); 1012cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 7 * 64); 1013cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 9 * 64); 1014cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 15 * 64); 1015cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 17 * 64); 1016cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 23 * 64); 1017cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 25 * 64); 1018cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 31 * 64); 1019cabdff1aSopenharmony_ci 1020cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); 1021cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); 1022cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); 1023cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); 1024cabdff1aSopenharmony_ci 1025cabdff1aSopenharmony_ci vec0 = __lsx_vadd_h(reg0, reg3); 1026cabdff1aSopenharmony_ci reg0 = __lsx_vsub_h(reg0, reg3); 1027cabdff1aSopenharmony_ci reg3 = __lsx_vadd_h(reg7, reg4); 1028cabdff1aSopenharmony_ci reg7 = __lsx_vsub_h(reg7, reg4); 1029cabdff1aSopenharmony_ci reg4 = __lsx_vadd_h(reg1, reg2); 1030cabdff1aSopenharmony_ci reg1 = __lsx_vsub_h(reg1, reg2); 1031cabdff1aSopenharmony_ci reg2 = __lsx_vadd_h(reg6, reg5); 1032cabdff1aSopenharmony_ci reg6 = __lsx_vsub_h(reg6, reg5); 1033cabdff1aSopenharmony_ci reg5 = vec0; 1034cabdff1aSopenharmony_ci 1035cabdff1aSopenharmony_ci /* 4 Stores */ 1036cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); 1037cabdff1aSopenharmony_ci __lsx_vst(vec0, tmp_odd_buf, 4 * 16); 1038cabdff1aSopenharmony_ci __lsx_vst(vec1, tmp_odd_buf, 4 * 16 + 16); 1039cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); 1040cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); 1041cabdff1aSopenharmony_ci __lsx_vst(vec0, tmp_odd_buf, 0); 1042cabdff1aSopenharmony_ci __lsx_vst(vec1, tmp_odd_buf, 16); 1043cabdff1aSopenharmony_ci 1044cabdff1aSopenharmony_ci /* 4 Stores */ 1045cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); 1046cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); 1047cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); 1048cabdff1aSopenharmony_ci __lsx_vst(vec0, tmp_odd_buf, 6 * 16); 1049cabdff1aSopenharmony_ci __lsx_vst(vec1, tmp_odd_buf, 6 * 16 + 16); 1050cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); 1051cabdff1aSopenharmony_ci __lsx_vst(vec2, tmp_odd_buf, 2 * 16); 1052cabdff1aSopenharmony_ci __lsx_vst(vec3, tmp_odd_buf, 2 * 16 + 16); 1053cabdff1aSopenharmony_ci 1054cabdff1aSopenharmony_ci /* Odd stage 2 */ 1055cabdff1aSopenharmony_ci /* 8 loads */ 1056cabdff1aSopenharmony_ci reg0 = __lsx_vld(tmp_buf, 3 * 64); 1057cabdff1aSopenharmony_ci reg1 = __lsx_vld(tmp_buf, 5 * 64); 1058cabdff1aSopenharmony_ci reg2 = __lsx_vld(tmp_buf, 11 * 64); 1059cabdff1aSopenharmony_ci reg3 = __lsx_vld(tmp_buf, 13 * 64); 1060cabdff1aSopenharmony_ci reg4 = __lsx_vld(tmp_buf, 19 * 64); 1061cabdff1aSopenharmony_ci reg5 = __lsx_vld(tmp_buf, 21 * 64); 1062cabdff1aSopenharmony_ci reg6 = __lsx_vld(tmp_buf, 27 * 64); 1063cabdff1aSopenharmony_ci reg7 = __lsx_vld(tmp_buf, 29 * 64); 1064cabdff1aSopenharmony_ci 1065cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 3 * 64); 1066cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 5 * 64); 1067cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 11 * 64); 1068cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 13 * 64); 1069cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 19 * 64); 1070cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 21 * 64); 1071cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 27 * 64); 1072cabdff1aSopenharmony_ci __lsx_vst(zero, tmp_buf, 29 * 64); 1073cabdff1aSopenharmony_ci 1074cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); 1075cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); 1076cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); 1077cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); 1078cabdff1aSopenharmony_ci 1079cabdff1aSopenharmony_ci /* 4 Stores */ 1080cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsub_h,reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, 1081cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1082cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); 1083cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); 1084cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); 1085cabdff1aSopenharmony_ci __lsx_vst(vec0, tmp_odd_buf, 12 * 16); 1086cabdff1aSopenharmony_ci __lsx_vst(vec1, tmp_odd_buf, 12 * 16 + 3 * 16); 1087cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); 1088cabdff1aSopenharmony_ci __lsx_vst(vec0, tmp_odd_buf, 10 * 16); 1089cabdff1aSopenharmony_ci __lsx_vst(vec1, tmp_odd_buf, 10 * 16 + 16); 1090cabdff1aSopenharmony_ci 1091cabdff1aSopenharmony_ci /* 4 Stores */ 1092cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, 1093cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1094cabdff1aSopenharmony_ci LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); 1095cabdff1aSopenharmony_ci __lsx_vst(reg0, tmp_odd_buf, 13 * 16); 1096cabdff1aSopenharmony_ci __lsx_vst(reg1, tmp_odd_buf, 13 * 16 + 16); 1097cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, 1098cabdff1aSopenharmony_ci reg0, reg1); 1099cabdff1aSopenharmony_ci __lsx_vst(reg0, tmp_odd_buf, 8 * 16); 1100cabdff1aSopenharmony_ci __lsx_vst(reg1, tmp_odd_buf, 8 * 16 + 16); 1101cabdff1aSopenharmony_ci 1102cabdff1aSopenharmony_ci /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ 1103cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1104cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, 1105cabdff1aSopenharmony_ci tmp_odd_buf, 32, tmp_odd_buf, 48, reg0, reg1, reg2, reg3); 1106cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_odd_buf, 8 * 16, tmp_odd_buf, 8 * 16 + 16, 1107cabdff1aSopenharmony_ci tmp_odd_buf, 8 * 16 + 32, tmp_odd_buf, 8 * 16 + 48, 1108cabdff1aSopenharmony_ci reg4, reg5, reg6, reg7); 1109cabdff1aSopenharmony_ci 1110cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, 1111cabdff1aSopenharmony_ci loc0, loc1, loc2, loc3); 1112cabdff1aSopenharmony_ci __lsx_vst(loc0, tmp_odd_buf, 0); 1113cabdff1aSopenharmony_ci __lsx_vst(loc1, tmp_odd_buf, 16); 1114cabdff1aSopenharmony_ci __lsx_vst(loc2, tmp_odd_buf, 32); 1115cabdff1aSopenharmony_ci __lsx_vst(loc3, tmp_odd_buf, 48); 1116cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); 1117cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 1118cabdff1aSopenharmony_ci 1119cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); 1120cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 1121cabdff1aSopenharmony_ci __lsx_vst(loc0, tmp_odd_buf, 8 * 16); 1122cabdff1aSopenharmony_ci __lsx_vst(loc1, tmp_odd_buf, 8 * 16 + 16); 1123cabdff1aSopenharmony_ci __lsx_vst(loc2, tmp_odd_buf, 8 * 16 + 32); 1124cabdff1aSopenharmony_ci __lsx_vst(loc3, tmp_odd_buf, 8 * 16 + 48); 1125cabdff1aSopenharmony_ci 1126cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1127cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_odd_buf, 4 * 16, tmp_odd_buf, 4 * 16 + 16, 1128cabdff1aSopenharmony_ci tmp_odd_buf, 4 * 16 + 32, tmp_odd_buf, 4 * 16 + 48, 1129cabdff1aSopenharmony_ci reg1, reg2, reg0, reg3); 1130cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, tmp_odd_buf, 12 * 16, tmp_odd_buf, 12 * 16 + 16, 1131cabdff1aSopenharmony_ci tmp_odd_buf, 12 * 16 + 32, tmp_odd_buf, 12 * 16 + 48, 1132cabdff1aSopenharmony_ci reg4, reg5, reg6, reg7); 1133cabdff1aSopenharmony_ci 1134cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, 1135cabdff1aSopenharmony_ci loc0, loc1, loc2, loc3); 1136cabdff1aSopenharmony_ci __lsx_vst(loc0, tmp_odd_buf, 4 * 16); 1137cabdff1aSopenharmony_ci __lsx_vst(loc1, tmp_odd_buf, 4 * 16 + 16); 1138cabdff1aSopenharmony_ci __lsx_vst(loc2, tmp_odd_buf, 4 * 16 + 32); 1139cabdff1aSopenharmony_ci __lsx_vst(loc3, tmp_odd_buf, 4 * 16 + 48); 1140cabdff1aSopenharmony_ci 1141cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); 1142cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); 1143cabdff1aSopenharmony_ci 1144cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); 1145cabdff1aSopenharmony_ci VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); 1146cabdff1aSopenharmony_ci __lsx_vst(loc0, tmp_odd_buf, 12 * 16); 1147cabdff1aSopenharmony_ci __lsx_vst(loc1, tmp_odd_buf, 12 * 16 + 16); 1148cabdff1aSopenharmony_ci __lsx_vst(loc2, tmp_odd_buf, 12 * 16 + 32); 1149cabdff1aSopenharmony_ci __lsx_vst(loc3, tmp_odd_buf, 12 * 16 + 48); 1150cabdff1aSopenharmony_ci} 1151cabdff1aSopenharmony_ci 1152cabdff1aSopenharmony_cistatic void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, 1153cabdff1aSopenharmony_ci int16_t *tmp_odd_buf, 1154cabdff1aSopenharmony_ci uint8_t *dst, 1155cabdff1aSopenharmony_ci int32_t dst_stride) 1156cabdff1aSopenharmony_ci{ 1157cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; 1158cabdff1aSopenharmony_ci __m128i m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; 1159cabdff1aSopenharmony_ci 1160cabdff1aSopenharmony_ci /* FINAL BUTTERFLY : Dependency on Even & Odd */ 1161cabdff1aSopenharmony_ci vec0 = __lsx_vld(tmp_odd_buf, 0); 1162cabdff1aSopenharmony_ci vec1 = __lsx_vld(tmp_odd_buf, 9 * 16); 1163cabdff1aSopenharmony_ci vec2 = __lsx_vld(tmp_odd_buf, 14 * 16); 1164cabdff1aSopenharmony_ci vec3 = __lsx_vld(tmp_odd_buf, 6 * 16); 1165cabdff1aSopenharmony_ci loc0 = __lsx_vld(tmp_eve_buf, 0); 1166cabdff1aSopenharmony_ci loc1 = __lsx_vld(tmp_eve_buf, 8 * 16); 1167cabdff1aSopenharmony_ci loc2 = __lsx_vld(tmp_eve_buf, 4 * 16); 1168cabdff1aSopenharmony_ci loc3 = __lsx_vld(tmp_eve_buf, 12 * 16); 1169cabdff1aSopenharmony_ci 1170cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 1171cabdff1aSopenharmony_ci m0, m4, m2, m6); 1172cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); 1173cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); 1174cabdff1aSopenharmony_ci 1175cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 1176cabdff1aSopenharmony_ci m6, m2, m4, m0); 1177cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); 1178cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), 1179cabdff1aSopenharmony_ci m0, m2, m4, m6); 1180cabdff1aSopenharmony_ci 1181cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1182cabdff1aSopenharmony_ci vec0 = __lsx_vld(tmp_odd_buf, 4 * 16); 1183cabdff1aSopenharmony_ci vec1 = __lsx_vld(tmp_odd_buf, 13 * 16); 1184cabdff1aSopenharmony_ci vec2 = __lsx_vld(tmp_odd_buf, 10 * 16); 1185cabdff1aSopenharmony_ci vec3 = __lsx_vld(tmp_odd_buf, 3 * 16); 1186cabdff1aSopenharmony_ci loc0 = __lsx_vld(tmp_eve_buf, 2 * 16); 1187cabdff1aSopenharmony_ci loc1 = __lsx_vld(tmp_eve_buf, 10 * 16); 1188cabdff1aSopenharmony_ci loc2 = __lsx_vld(tmp_eve_buf, 6 * 16); 1189cabdff1aSopenharmony_ci loc3 = __lsx_vld(tmp_eve_buf, 14 * 16); 1190cabdff1aSopenharmony_ci 1191cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 1192cabdff1aSopenharmony_ci m1, m5, m3, m7); 1193cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); 1194cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), 1195cabdff1aSopenharmony_ci m1, m3, m5, m7); 1196cabdff1aSopenharmony_ci 1197cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 1198cabdff1aSopenharmony_ci m7, m3, m5, m1); 1199cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); 1200cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), 1201cabdff1aSopenharmony_ci m1, m3, m5, m7); 1202cabdff1aSopenharmony_ci 1203cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1204cabdff1aSopenharmony_ci vec0 = __lsx_vld(tmp_odd_buf, 2 * 16); 1205cabdff1aSopenharmony_ci vec1 = __lsx_vld(tmp_odd_buf, 11 * 16); 1206cabdff1aSopenharmony_ci vec2 = __lsx_vld(tmp_odd_buf, 12 * 16); 1207cabdff1aSopenharmony_ci vec3 = __lsx_vld(tmp_odd_buf, 7 * 16); 1208cabdff1aSopenharmony_ci loc0 = __lsx_vld(tmp_eve_buf, 1 * 16); 1209cabdff1aSopenharmony_ci loc1 = __lsx_vld(tmp_eve_buf, 9 * 16); 1210cabdff1aSopenharmony_ci loc2 = __lsx_vld(tmp_eve_buf, 5 * 16); 1211cabdff1aSopenharmony_ci loc3 = __lsx_vld(tmp_eve_buf, 13 * 16); 1212cabdff1aSopenharmony_ci 1213cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 1214cabdff1aSopenharmony_ci n0, n4, n2, n6); 1215cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); 1216cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), 1217cabdff1aSopenharmony_ci n0, n2, n4, n6); 1218cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 1219cabdff1aSopenharmony_ci n6, n2, n4, n0); 1220cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); 1221cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), 1222cabdff1aSopenharmony_ci n0, n2, n4, n6); 1223cabdff1aSopenharmony_ci 1224cabdff1aSopenharmony_ci /* Load 8 & Store 8 */ 1225cabdff1aSopenharmony_ci vec0 = __lsx_vld(tmp_odd_buf, 5 * 16); 1226cabdff1aSopenharmony_ci vec1 = __lsx_vld(tmp_odd_buf, 15 * 16); 1227cabdff1aSopenharmony_ci vec2 = __lsx_vld(tmp_odd_buf, 8 * 16); 1228cabdff1aSopenharmony_ci vec3 = __lsx_vld(tmp_odd_buf, 1 * 16); 1229cabdff1aSopenharmony_ci loc0 = __lsx_vld(tmp_eve_buf, 3 * 16); 1230cabdff1aSopenharmony_ci loc1 = __lsx_vld(tmp_eve_buf, 11 * 16); 1231cabdff1aSopenharmony_ci loc2 = __lsx_vld(tmp_eve_buf, 7 * 16); 1232cabdff1aSopenharmony_ci loc3 = __lsx_vld(tmp_eve_buf, 15 * 16); 1233cabdff1aSopenharmony_ci 1234cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 1235cabdff1aSopenharmony_ci n1, n5, n3, n7); 1236cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); 1237cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), 1238cabdff1aSopenharmony_ci n1, n3, n5, n7); 1239cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, 1240cabdff1aSopenharmony_ci n7, n3, n5, n1); 1241cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); 1242cabdff1aSopenharmony_ci VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), 1243cabdff1aSopenharmony_ci n1, n3, n5, n7); 1244cabdff1aSopenharmony_ci} 1245cabdff1aSopenharmony_ci 1246cabdff1aSopenharmony_cistatic void vp9_idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst, 1247cabdff1aSopenharmony_ci int32_t dst_stride) 1248cabdff1aSopenharmony_ci{ 1249cabdff1aSopenharmony_ci int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(16); 1250cabdff1aSopenharmony_ci int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(16); 1251cabdff1aSopenharmony_ci 1252cabdff1aSopenharmony_ci vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); 1253cabdff1aSopenharmony_ci vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); 1254cabdff1aSopenharmony_ci vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], 1255cabdff1aSopenharmony_ci dst, dst_stride); 1256cabdff1aSopenharmony_ci} 1257cabdff1aSopenharmony_ci 1258cabdff1aSopenharmony_cistatic void vp9_idct8x32_1d_columns_lsx(int16_t *input, int16_t *output, 1259cabdff1aSopenharmony_ci int16_t *tmp_buf) 1260cabdff1aSopenharmony_ci{ 1261cabdff1aSopenharmony_ci int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(16); 1262cabdff1aSopenharmony_ci int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(16); 1263cabdff1aSopenharmony_ci 1264cabdff1aSopenharmony_ci vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); 1265cabdff1aSopenharmony_ci vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); 1266cabdff1aSopenharmony_ci vp9_idct_butterfly_transpose_store(tmp_buf, &tmp_eve_buf[0], 1267cabdff1aSopenharmony_ci &tmp_odd_buf[0], output); 1268cabdff1aSopenharmony_ci} 1269cabdff1aSopenharmony_ci 1270cabdff1aSopenharmony_cistatic void vp9_idct32x32_1_add_lsx(int16_t *input, uint8_t *dst, 1271cabdff1aSopenharmony_ci int32_t dst_stride) 1272cabdff1aSopenharmony_ci{ 1273cabdff1aSopenharmony_ci int32_t i; 1274cabdff1aSopenharmony_ci int16_t out; 1275cabdff1aSopenharmony_ci uint8_t *dst_tmp = dst + dst_stride; 1276cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 1277cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; 1278cabdff1aSopenharmony_ci __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec; 1279cabdff1aSopenharmony_ci 1280cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); 1281cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); 1282cabdff1aSopenharmony_ci out = ROUND_POWER_OF_TWO(out, 6); 1283cabdff1aSopenharmony_ci input[0] = 0; 1284cabdff1aSopenharmony_ci 1285cabdff1aSopenharmony_ci vec = __lsx_vreplgr2vr_h(out); 1286cabdff1aSopenharmony_ci 1287cabdff1aSopenharmony_ci for (i = 16; i--;) { 1288cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); 1289cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst2, dst3); 1290cabdff1aSopenharmony_ci 1291cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3, 1292cabdff1aSopenharmony_ci res0, res1, res2, res3); 1293cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3, 1294cabdff1aSopenharmony_ci res4, res5, res6, res7); 1295cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, 1296cabdff1aSopenharmony_ci res0, res1, res2, res3); 1297cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, 1298cabdff1aSopenharmony_ci res4, res5, res6, res7); 1299cabdff1aSopenharmony_ci DUP4_ARG1(__lsx_vclip255_h, res0, res1, res2, res3, res0, res1, res2, res3); 1300cabdff1aSopenharmony_ci DUP4_ARG1(__lsx_vclip255_h, res4, res5, res6, res7, res4, res5, res6, res7); 1301cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpickev_b, res4, res0, res5, res1, res6, res2, res7, res3, 1302cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1303cabdff1aSopenharmony_ci 1304cabdff1aSopenharmony_ci __lsx_vst(tmp0, dst, 0); 1305cabdff1aSopenharmony_ci __lsx_vst(tmp1, dst, 16); 1306cabdff1aSopenharmony_ci __lsx_vst(tmp2, dst_tmp, 0); 1307cabdff1aSopenharmony_ci __lsx_vst(tmp3, dst_tmp, 16); 1308cabdff1aSopenharmony_ci dst = dst_tmp + dst_stride; 1309cabdff1aSopenharmony_ci dst_tmp = dst + dst_stride; 1310cabdff1aSopenharmony_ci } 1311cabdff1aSopenharmony_ci} 1312cabdff1aSopenharmony_ci 1313cabdff1aSopenharmony_cistatic void vp9_idct32x32_34_colcol_addblk_lsx(int16_t *input, uint8_t *dst, 1314cabdff1aSopenharmony_ci int32_t dst_stride) 1315cabdff1aSopenharmony_ci{ 1316cabdff1aSopenharmony_ci int32_t i; 1317cabdff1aSopenharmony_ci int16_t out_arr[32 * 32] ALLOC_ALIGNED(16); 1318cabdff1aSopenharmony_ci int16_t *out_ptr = out_arr; 1319cabdff1aSopenharmony_ci int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(16); 1320cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 1321cabdff1aSopenharmony_ci 1322cabdff1aSopenharmony_ci for (i = 16; i--;) { 1323cabdff1aSopenharmony_ci __lsx_vst(zero, out_ptr, 0); 1324cabdff1aSopenharmony_ci __lsx_vst(zero, out_ptr, 16); 1325cabdff1aSopenharmony_ci __lsx_vst(zero, out_ptr, 32); 1326cabdff1aSopenharmony_ci __lsx_vst(zero, out_ptr, 48); 1327cabdff1aSopenharmony_ci __lsx_vst(zero, out_ptr, 64); 1328cabdff1aSopenharmony_ci __lsx_vst(zero, out_ptr, 80); 1329cabdff1aSopenharmony_ci __lsx_vst(zero, out_ptr, 96); 1330cabdff1aSopenharmony_ci __lsx_vst(zero, out_ptr, 112); 1331cabdff1aSopenharmony_ci out_ptr += 64; 1332cabdff1aSopenharmony_ci } 1333cabdff1aSopenharmony_ci 1334cabdff1aSopenharmony_ci out_ptr = out_arr; 1335cabdff1aSopenharmony_ci 1336cabdff1aSopenharmony_ci /* process 8*32 block */ 1337cabdff1aSopenharmony_ci vp9_idct8x32_1d_columns_lsx(input, out_ptr, &tmp_buf[0]); 1338cabdff1aSopenharmony_ci 1339cabdff1aSopenharmony_ci /* transform columns */ 1340cabdff1aSopenharmony_ci for (i = 0; i < 4; i++) { 1341cabdff1aSopenharmony_ci /* process 8*32 block */ 1342cabdff1aSopenharmony_ci vp9_idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), 1343cabdff1aSopenharmony_ci (dst + (i << 3)), dst_stride); 1344cabdff1aSopenharmony_ci } 1345cabdff1aSopenharmony_ci} 1346cabdff1aSopenharmony_ci 1347cabdff1aSopenharmony_cistatic void vp9_idct32x32_colcol_addblk_lsx(int16_t *input, uint8_t *dst, 1348cabdff1aSopenharmony_ci int32_t dst_stride) 1349cabdff1aSopenharmony_ci{ 1350cabdff1aSopenharmony_ci int32_t i; 1351cabdff1aSopenharmony_ci int16_t out_arr[32 * 32] ALLOC_ALIGNED(16); 1352cabdff1aSopenharmony_ci int16_t *out_ptr = out_arr; 1353cabdff1aSopenharmony_ci int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(16); 1354cabdff1aSopenharmony_ci 1355cabdff1aSopenharmony_ci /* transform rows */ 1356cabdff1aSopenharmony_ci for (i = 0; i < 4; i++) { 1357cabdff1aSopenharmony_ci /* process 8*32 block */ 1358cabdff1aSopenharmony_ci vp9_idct8x32_1d_columns_lsx((input + (i << 3)), (out_ptr + (i << 8)), 1359cabdff1aSopenharmony_ci &tmp_buf[0]); 1360cabdff1aSopenharmony_ci } 1361cabdff1aSopenharmony_ci 1362cabdff1aSopenharmony_ci /* transform columns */ 1363cabdff1aSopenharmony_ci for (i = 0; i < 4; i++) { 1364cabdff1aSopenharmony_ci /* process 8*32 block */ 1365cabdff1aSopenharmony_ci vp9_idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), 1366cabdff1aSopenharmony_ci (dst + (i << 3)), dst_stride); 1367cabdff1aSopenharmony_ci } 1368cabdff1aSopenharmony_ci} 1369cabdff1aSopenharmony_ci 1370cabdff1aSopenharmony_civoid ff_idct_idct_8x8_add_lsx(uint8_t *dst, ptrdiff_t stride, 1371cabdff1aSopenharmony_ci int16_t *block, int eob) 1372cabdff1aSopenharmony_ci{ 1373cabdff1aSopenharmony_ci if (eob == 1) { 1374cabdff1aSopenharmony_ci vp9_idct8x8_1_add_lsx(block, dst, stride); 1375cabdff1aSopenharmony_ci } 1376cabdff1aSopenharmony_ci else if (eob <= 12) { 1377cabdff1aSopenharmony_ci vp9_idct8x8_12_colcol_addblk_lsx(block, dst, stride); 1378cabdff1aSopenharmony_ci } 1379cabdff1aSopenharmony_ci else { 1380cabdff1aSopenharmony_ci vp9_idct8x8_colcol_addblk_lsx(block, dst, stride); 1381cabdff1aSopenharmony_ci } 1382cabdff1aSopenharmony_ci} 1383cabdff1aSopenharmony_ci 1384cabdff1aSopenharmony_civoid ff_idct_idct_16x16_add_lsx(uint8_t *dst, ptrdiff_t stride, 1385cabdff1aSopenharmony_ci int16_t *block, int eob) 1386cabdff1aSopenharmony_ci{ 1387cabdff1aSopenharmony_ci if (eob == 1) { 1388cabdff1aSopenharmony_ci /* DC only DCT coefficient. */ 1389cabdff1aSopenharmony_ci vp9_idct16x16_1_add_lsx(block, dst, stride); 1390cabdff1aSopenharmony_ci } 1391cabdff1aSopenharmony_ci else if (eob <= 10) { 1392cabdff1aSopenharmony_ci vp9_idct16x16_10_colcol_addblk_lsx(block, dst, stride); 1393cabdff1aSopenharmony_ci } 1394cabdff1aSopenharmony_ci else { 1395cabdff1aSopenharmony_ci vp9_idct16x16_colcol_addblk_lsx(block, dst, stride); 1396cabdff1aSopenharmony_ci } 1397cabdff1aSopenharmony_ci} 1398cabdff1aSopenharmony_ci 1399cabdff1aSopenharmony_civoid ff_idct_idct_32x32_add_lsx(uint8_t *dst, ptrdiff_t stride, 1400cabdff1aSopenharmony_ci int16_t *block, int eob) 1401cabdff1aSopenharmony_ci{ 1402cabdff1aSopenharmony_ci if (eob == 1) { 1403cabdff1aSopenharmony_ci vp9_idct32x32_1_add_lsx(block, dst, stride); 1404cabdff1aSopenharmony_ci } 1405cabdff1aSopenharmony_ci else if (eob <= 34) { 1406cabdff1aSopenharmony_ci vp9_idct32x32_34_colcol_addblk_lsx(block, dst, stride); 1407cabdff1aSopenharmony_ci } 1408cabdff1aSopenharmony_ci else { 1409cabdff1aSopenharmony_ci vp9_idct32x32_colcol_addblk_lsx(block, dst, stride); 1410cabdff1aSopenharmony_ci } 1411cabdff1aSopenharmony_ci} 1412