1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Loongson SIMD optimized vc1dsp 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * Copyright (c) 2019 Loongson Technology Corporation Limited 5cabdff1aSopenharmony_ci * gxw <guxiwei-hf@loongson.cn> 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * This file is part of FFmpeg. 8cabdff1aSopenharmony_ci * 9cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 10cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 11cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 12cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 13cabdff1aSopenharmony_ci * 14cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 15cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 16cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17cabdff1aSopenharmony_ci * Lesser General Public License for more details. 18cabdff1aSopenharmony_ci * 19cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 20cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 21cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22cabdff1aSopenharmony_ci */ 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_ci#include "vc1dsp_mips.h" 25cabdff1aSopenharmony_ci#include "constants.h" 26cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 27cabdff1aSopenharmony_ci 28cabdff1aSopenharmony_civoid ff_vc1_inv_trans_8x8_msa(int16_t block[64]) 29cabdff1aSopenharmony_ci{ 30cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 31cabdff1aSopenharmony_ci v4i32 in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7; 32cabdff1aSopenharmony_ci v4i32 in_l0, in_l1, in_l2, in_l3, in_l4, in_l5, in_l6, in_l7; 33cabdff1aSopenharmony_ci v4i32 t_r1, t_r2, t_r3, t_r4, t_r5, t_r6, t_r7, t_r8; 34cabdff1aSopenharmony_ci v4i32 t_l1, t_l2, t_l3, t_l4, t_l5, t_l6, t_l7, t_l8; 35cabdff1aSopenharmony_ci v4i32 cnst_12 = {12, 12, 12, 12}; 36cabdff1aSopenharmony_ci v4i32 cnst_4 = {4, 4, 4, 4}; 37cabdff1aSopenharmony_ci v4i32 cnst_16 = {16, 16, 16, 16}; 38cabdff1aSopenharmony_ci v4i32 cnst_6 = {6, 6, 6, 6}; 39cabdff1aSopenharmony_ci v4i32 cnst_15 = {15, 15, 15, 15}; 40cabdff1aSopenharmony_ci v4i32 cnst_9 = {9, 9, 9, 9}; 41cabdff1aSopenharmony_ci v4i32 cnst_1 = {1, 1, 1, 1}; 42cabdff1aSopenharmony_ci v4i32 cnst_64 = {64, 64, 64, 64}; 43cabdff1aSopenharmony_ci 44cabdff1aSopenharmony_ci LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); 45cabdff1aSopenharmony_ci UNPCK_SH_SW(in0, in_r0, in_l0); 46cabdff1aSopenharmony_ci UNPCK_SH_SW(in1, in_r1, in_l1); 47cabdff1aSopenharmony_ci UNPCK_SH_SW(in2, in_r2, in_l2); 48cabdff1aSopenharmony_ci UNPCK_SH_SW(in3, in_r3, in_l3); 49cabdff1aSopenharmony_ci UNPCK_SH_SW(in4, in_r4, in_l4); 50cabdff1aSopenharmony_ci UNPCK_SH_SW(in5, in_r5, in_l5); 51cabdff1aSopenharmony_ci UNPCK_SH_SW(in6, in_r6, in_l6); 52cabdff1aSopenharmony_ci UNPCK_SH_SW(in7, in_r7, in_l7); 53cabdff1aSopenharmony_ci // First loop 54cabdff1aSopenharmony_ci t_r1 = cnst_12 * (in_r0 + in_r4) + cnst_4; 55cabdff1aSopenharmony_ci t_l1 = cnst_12 * (in_l0 + in_l4) + cnst_4; 56cabdff1aSopenharmony_ci t_r2 = cnst_12 * (in_r0 - in_r4) + cnst_4; 57cabdff1aSopenharmony_ci t_l2 = cnst_12 * (in_l0 - in_l4) + cnst_4; 58cabdff1aSopenharmony_ci t_r3 = cnst_16 * in_r2 + cnst_6 * in_r6; 59cabdff1aSopenharmony_ci t_l3 = cnst_16 * in_l2 + cnst_6 * in_l6; 60cabdff1aSopenharmony_ci t_r4 = cnst_6 * in_r2 - cnst_16 * in_r6; 61cabdff1aSopenharmony_ci t_l4 = cnst_6 * in_l2 - cnst_16 * in_l6; 62cabdff1aSopenharmony_ci 63cabdff1aSopenharmony_ci ADD4(t_r1, t_r3, t_l1, t_l3, t_r2, t_r4, t_l2, t_l4, t_r5, t_l5, t_r6, t_l6); 64cabdff1aSopenharmony_ci SUB4(t_r2, t_r4, t_l2, t_l4, t_r1, t_r3, t_l1, t_l3, t_r7, t_l7, t_r8, t_l8); 65cabdff1aSopenharmony_ci t_r1 = cnst_16 * in_r1 + cnst_15 * in_r3 + cnst_9 * in_r5 + cnst_4 * in_r7; 66cabdff1aSopenharmony_ci t_l1 = cnst_16 * in_l1 + cnst_15 * in_l3 + cnst_9 * in_l5 + cnst_4 * in_l7; 67cabdff1aSopenharmony_ci t_r2 = cnst_15 * in_r1 - cnst_4 * in_r3 - cnst_16 * in_r5 - cnst_9 * in_r7; 68cabdff1aSopenharmony_ci t_l2 = cnst_15 * in_l1 - cnst_4 * in_l3 - cnst_16 * in_l5 - cnst_9 * in_l7; 69cabdff1aSopenharmony_ci t_r3 = cnst_9 * in_r1 - cnst_16 * in_r3 + cnst_4 * in_r5 + cnst_15 * in_r7; 70cabdff1aSopenharmony_ci t_l3 = cnst_9 * in_l1 - cnst_16 * in_l3 + cnst_4 * in_l5 + cnst_15 * in_l7; 71cabdff1aSopenharmony_ci t_r4 = cnst_4 * in_r1 - cnst_9 * in_r3 + cnst_15 * in_r5 - cnst_16 * in_r7; 72cabdff1aSopenharmony_ci t_l4 = cnst_4 * in_l1 - cnst_9 * in_l3 + cnst_15 * in_l5 - cnst_16 * in_l7; 73cabdff1aSopenharmony_ci 74cabdff1aSopenharmony_ci in_r0 = (t_r5 + t_r1) >> 3; 75cabdff1aSopenharmony_ci in_l0 = (t_l5 + t_l1) >> 3; 76cabdff1aSopenharmony_ci in_r1 = (t_r6 + t_r2) >> 3; 77cabdff1aSopenharmony_ci in_l1 = (t_l6 + t_l2) >> 3; 78cabdff1aSopenharmony_ci in_r2 = (t_r7 + t_r3) >> 3; 79cabdff1aSopenharmony_ci in_l2 = (t_l7 + t_l3) >> 3; 80cabdff1aSopenharmony_ci in_r3 = (t_r8 + t_r4) >> 3; 81cabdff1aSopenharmony_ci in_l3 = (t_l8 + t_l4) >> 3; 82cabdff1aSopenharmony_ci 83cabdff1aSopenharmony_ci in_r4 = (t_r8 - t_r4) >> 3; 84cabdff1aSopenharmony_ci in_l4 = (t_l8 - t_l4) >> 3; 85cabdff1aSopenharmony_ci in_r5 = (t_r7 - t_r3) >> 3; 86cabdff1aSopenharmony_ci in_l5 = (t_l7 - t_l3) >> 3; 87cabdff1aSopenharmony_ci in_r6 = (t_r6 - t_r2) >> 3; 88cabdff1aSopenharmony_ci in_l6 = (t_l6 - t_l2) >> 3; 89cabdff1aSopenharmony_ci in_r7 = (t_r5 - t_r1) >> 3; 90cabdff1aSopenharmony_ci in_l7 = (t_l5 - t_l1) >> 3; 91cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in_r0, in_r1, in_r2, in_r3, in_r0, in_r1, in_r2, in_r3); 92cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in_l0, in_l1, in_l2, in_l3, in_l0, in_l1, in_l2, in_l3); 93cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in_r4, in_r5, in_r6, in_r7, in_r4, in_r5, in_r6, in_r7); 94cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in_l4, in_l5, in_l6, in_l7, in_l4, in_l5, in_l6, in_l7); 95cabdff1aSopenharmony_ci // Second loop 96cabdff1aSopenharmony_ci t_r1 = cnst_12 * (in_r0 + in_l0) + cnst_64; 97cabdff1aSopenharmony_ci t_l1 = cnst_12 * (in_r4 + in_l4) + cnst_64; 98cabdff1aSopenharmony_ci t_r2 = cnst_12 * (in_r0 - in_l0) + cnst_64; 99cabdff1aSopenharmony_ci t_l2 = cnst_12 * (in_r4 - in_l4) + cnst_64; 100cabdff1aSopenharmony_ci t_r3 = cnst_16 * in_r2 + cnst_6 * in_l2; 101cabdff1aSopenharmony_ci t_l3 = cnst_16 * in_r6 + cnst_6 * in_l6; 102cabdff1aSopenharmony_ci t_r4 = cnst_6 * in_r2 - cnst_16 * in_l2; 103cabdff1aSopenharmony_ci t_l4 = cnst_6 * in_r6 - cnst_16 * in_l6; 104cabdff1aSopenharmony_ci 105cabdff1aSopenharmony_ci ADD4(t_r1, t_r3, t_l1, t_l3, t_r2, t_r4, t_l2, t_l4, t_r5, t_l5, t_r6, t_l6); 106cabdff1aSopenharmony_ci SUB4(t_r2, t_r4, t_l2, t_l4, t_r1, t_r3, t_l1, t_l3, t_r7, t_l7, t_r8, t_l8); 107cabdff1aSopenharmony_ci t_r1 = cnst_16 * in_r1 + cnst_15 * in_r3 + cnst_9 * in_l1 + cnst_4 * in_l3; 108cabdff1aSopenharmony_ci t_l1 = cnst_16 * in_r5 + cnst_15 * in_r7 + cnst_9 * in_l5 + cnst_4 * in_l7; 109cabdff1aSopenharmony_ci t_r2 = cnst_15 * in_r1 - cnst_4 * in_r3 - cnst_16 * in_l1 - cnst_9 * in_l3; 110cabdff1aSopenharmony_ci t_l2 = cnst_15 * in_r5 - cnst_4 * in_r7 - cnst_16 * in_l5 - cnst_9 * in_l7; 111cabdff1aSopenharmony_ci t_r3 = cnst_9 * in_r1 - cnst_16 * in_r3 + cnst_4 * in_l1 + cnst_15 * in_l3; 112cabdff1aSopenharmony_ci t_l3 = cnst_9 * in_r5 - cnst_16 * in_r7 + cnst_4 * in_l5 + cnst_15 * in_l7; 113cabdff1aSopenharmony_ci t_r4 = cnst_4 * in_r1 - cnst_9 * in_r3 + cnst_15 * in_l1 - cnst_16 * in_l3; 114cabdff1aSopenharmony_ci t_l4 = cnst_4 * in_r5 - cnst_9 * in_r7 + cnst_15 * in_l5 - cnst_16 * in_l7; 115cabdff1aSopenharmony_ci 116cabdff1aSopenharmony_ci in_r0 = (t_r5 + t_r1) >> 7; 117cabdff1aSopenharmony_ci in_l0 = (t_l5 + t_l1) >> 7; 118cabdff1aSopenharmony_ci in_r1 = (t_r6 + t_r2) >> 7; 119cabdff1aSopenharmony_ci in_l1 = (t_l6 + t_l2) >> 7; 120cabdff1aSopenharmony_ci in_r2 = (t_r7 + t_r3) >> 7; 121cabdff1aSopenharmony_ci in_l2 = (t_l7 + t_l3) >> 7; 122cabdff1aSopenharmony_ci in_r3 = (t_r8 + t_r4) >> 7; 123cabdff1aSopenharmony_ci in_l3 = (t_l8 + t_l4) >> 7; 124cabdff1aSopenharmony_ci 125cabdff1aSopenharmony_ci in_r4 = (t_r8 - t_r4 + cnst_1) >> 7; 126cabdff1aSopenharmony_ci in_l4 = (t_l8 - t_l4 + cnst_1) >> 7; 127cabdff1aSopenharmony_ci in_r5 = (t_r7 - t_r3 + cnst_1) >> 7; 128cabdff1aSopenharmony_ci in_l5 = (t_l7 - t_l3 + cnst_1) >> 7; 129cabdff1aSopenharmony_ci in_r6 = (t_r6 - t_r2 + cnst_1) >> 7; 130cabdff1aSopenharmony_ci in_l6 = (t_l6 - t_l2 + cnst_1) >> 7; 131cabdff1aSopenharmony_ci in_r7 = (t_r5 - t_r1 + cnst_1) >> 7; 132cabdff1aSopenharmony_ci in_l7 = (t_l5 - t_l1 + cnst_1) >> 7; 133cabdff1aSopenharmony_ci PCKEV_H4_SH(in_l0, in_r0, in_l1, in_r1, in_l2, in_r2, in_l3, in_r3, 134cabdff1aSopenharmony_ci in0, in1, in2, in3); 135cabdff1aSopenharmony_ci PCKEV_H4_SH(in_l4, in_r4, in_l5, in_r5, in_l6, in_r6, in_l7, in_r7, 136cabdff1aSopenharmony_ci in4, in5, in6, in7); 137cabdff1aSopenharmony_ci ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, block, 8); 138cabdff1aSopenharmony_ci} 139cabdff1aSopenharmony_ci 140cabdff1aSopenharmony_civoid ff_vc1_inv_trans_4x8_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block) 141cabdff1aSopenharmony_ci{ 142cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 143cabdff1aSopenharmony_ci v4i32 in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7; 144cabdff1aSopenharmony_ci v4i32 t1, t2, t3, t4, t5, t6, t7, t8; 145cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 146cabdff1aSopenharmony_ci v16i8 zero_m = { 0 }; 147cabdff1aSopenharmony_ci v4i32 cnst_17 = {17, 17, 17, 17}; 148cabdff1aSopenharmony_ci v4i32 cnst_22 = {22, 22, 22, 22}; 149cabdff1aSopenharmony_ci v4i32 cnst_10 = {10, 10, 10, 10}; 150cabdff1aSopenharmony_ci v4i32 cnst_12 = {12, 12, 12, 12}; 151cabdff1aSopenharmony_ci v4i32 cnst_64 = {64, 64, 64, 64}; 152cabdff1aSopenharmony_ci v4i32 cnst_16 = {16, 16, 16, 16}; 153cabdff1aSopenharmony_ci v4i32 cnst_15 = {15, 15, 15, 15}; 154cabdff1aSopenharmony_ci v4i32 cnst_4 = {4, 4, 4, 4}; 155cabdff1aSopenharmony_ci v4i32 cnst_6 = {6, 6, 6, 6}; 156cabdff1aSopenharmony_ci v4i32 cnst_9 = {9, 9, 9, 9}; 157cabdff1aSopenharmony_ci v4i32 cnst_1 = {1, 1, 1, 1}; 158cabdff1aSopenharmony_ci 159cabdff1aSopenharmony_ci LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); 160cabdff1aSopenharmony_ci UNPCK_R_SH_SW(in0, in_r0); 161cabdff1aSopenharmony_ci UNPCK_R_SH_SW(in1, in_r1); 162cabdff1aSopenharmony_ci UNPCK_R_SH_SW(in2, in_r2); 163cabdff1aSopenharmony_ci UNPCK_R_SH_SW(in3, in_r3); 164cabdff1aSopenharmony_ci UNPCK_R_SH_SW(in4, in_r4); 165cabdff1aSopenharmony_ci UNPCK_R_SH_SW(in5, in_r5); 166cabdff1aSopenharmony_ci UNPCK_R_SH_SW(in6, in_r6); 167cabdff1aSopenharmony_ci UNPCK_R_SH_SW(in7, in_r7); 168cabdff1aSopenharmony_ci // First loop 169cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in_r0, in_r1, in_r2, in_r3, in_r0, in_r1, in_r2, in_r3); 170cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in_r4, in_r5, in_r6, in_r7, in_r4, in_r5, in_r6, in_r7); 171cabdff1aSopenharmony_ci t1 = cnst_17 * (in_r0 + in_r2) + cnst_4; 172cabdff1aSopenharmony_ci t5 = cnst_17 * (in_r4 + in_r6) + cnst_4; 173cabdff1aSopenharmony_ci t2 = cnst_17 * (in_r0 - in_r2) + cnst_4; 174cabdff1aSopenharmony_ci t6 = cnst_17 * (in_r4 - in_r6) + cnst_4; 175cabdff1aSopenharmony_ci t3 = cnst_22 * in_r1 + cnst_10 * in_r3; 176cabdff1aSopenharmony_ci t7 = cnst_22 * in_r5 + cnst_10 * in_r7; 177cabdff1aSopenharmony_ci t4 = cnst_22 * in_r3 - cnst_10 * in_r1; 178cabdff1aSopenharmony_ci t8 = cnst_22 * in_r7 - cnst_10 * in_r5; 179cabdff1aSopenharmony_ci 180cabdff1aSopenharmony_ci in_r0 = (t1 + t3) >> 3; 181cabdff1aSopenharmony_ci in_r4 = (t5 + t7) >> 3; 182cabdff1aSopenharmony_ci in_r1 = (t2 - t4) >> 3; 183cabdff1aSopenharmony_ci in_r5 = (t6 - t8) >> 3; 184cabdff1aSopenharmony_ci in_r2 = (t2 + t4) >> 3; 185cabdff1aSopenharmony_ci in_r6 = (t6 + t8) >> 3; 186cabdff1aSopenharmony_ci in_r3 = (t1 - t3) >> 3; 187cabdff1aSopenharmony_ci in_r7 = (t5 - t7) >> 3; 188cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in_r0, in_r1, in_r2, in_r3, in_r0, in_r1, in_r2, in_r3); 189cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in_r4, in_r5, in_r6, in_r7, in_r4, in_r5, in_r6, in_r7); 190cabdff1aSopenharmony_ci PCKEV_H4_SH(in_r1, in_r0, in_r3, in_r2, in_r5, in_r4, in_r7, in_r6, 191cabdff1aSopenharmony_ci in0, in1, in2, in3); 192cabdff1aSopenharmony_ci ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1, block, 8); 193cabdff1aSopenharmony_ci // Second loop 194cabdff1aSopenharmony_ci t1 = cnst_12 * (in_r0 + in_r4) + cnst_64; 195cabdff1aSopenharmony_ci t2 = cnst_12 * (in_r0 - in_r4) + cnst_64; 196cabdff1aSopenharmony_ci t3 = cnst_16 * in_r2 + cnst_6 * in_r6; 197cabdff1aSopenharmony_ci t4 = cnst_6 * in_r2 - cnst_16 * in_r6; 198cabdff1aSopenharmony_ci t5 = t1 + t3, t6 = t2 + t4; 199cabdff1aSopenharmony_ci t7 = t2 - t4, t8 = t1 - t3; 200cabdff1aSopenharmony_ci t1 = cnst_16 * in_r1 + cnst_15 * in_r3 + cnst_9 * in_r5 + cnst_4 * in_r7; 201cabdff1aSopenharmony_ci t2 = cnst_15 * in_r1 - cnst_4 * in_r3 - cnst_16 * in_r5 - cnst_9 * in_r7; 202cabdff1aSopenharmony_ci t3 = cnst_9 * in_r1 - cnst_16 * in_r3 + cnst_4 * in_r5 + cnst_15 * in_r7; 203cabdff1aSopenharmony_ci t4 = cnst_4 * in_r1 - cnst_9 * in_r3 + cnst_15 * in_r5 - cnst_16 * in_r7; 204cabdff1aSopenharmony_ci LD_SW8(dest, linesize, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 205cabdff1aSopenharmony_ci ILVR_B8_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3, 206cabdff1aSopenharmony_ci zero_m, dst4, zero_m, dst5, zero_m, dst6, zero_m, dst7, 207cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 208cabdff1aSopenharmony_ci ILVR_H4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3, 209cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 210cabdff1aSopenharmony_ci ILVR_H4_SW(zero_m, dst4, zero_m, dst5, zero_m, dst6, zero_m, dst7, 211cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 212cabdff1aSopenharmony_ci in_r0 = (t5 + t1) >> 7; 213cabdff1aSopenharmony_ci in_r1 = (t6 + t2) >> 7; 214cabdff1aSopenharmony_ci in_r2 = (t7 + t3) >> 7; 215cabdff1aSopenharmony_ci in_r3 = (t8 + t4) >> 7; 216cabdff1aSopenharmony_ci in_r4 = (t8 - t4 + cnst_1) >> 7; 217cabdff1aSopenharmony_ci in_r5 = (t7 - t3 + cnst_1) >> 7; 218cabdff1aSopenharmony_ci in_r6 = (t6 - t2 + cnst_1) >> 7; 219cabdff1aSopenharmony_ci in_r7 = (t5 - t1 + cnst_1) >> 7; 220cabdff1aSopenharmony_ci ADD4(in_r0, dst0, in_r1, dst1, in_r2, dst2, in_r3, dst3, 221cabdff1aSopenharmony_ci in_r0, in_r1, in_r2, in_r3); 222cabdff1aSopenharmony_ci ADD4(in_r4, dst4, in_r5, dst5, in_r6, dst6, in_r7, dst7, 223cabdff1aSopenharmony_ci in_r4, in_r5, in_r6, in_r7); 224cabdff1aSopenharmony_ci CLIP_SW8_0_255(in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7); 225cabdff1aSopenharmony_ci PCKEV_H4_SH(in_r1, in_r0, in_r3, in_r2, in_r5, in_r4, in_r7, in_r6, 226cabdff1aSopenharmony_ci in0, in1, in2, in3); 227cabdff1aSopenharmony_ci PCKEV_B2_SH(in1, in0, in3, in2, in0, in1); 228cabdff1aSopenharmony_ci ST_W8(in0, in1, 0, 1, 2, 3, 0, 1, 2, 3, dest, linesize); 229cabdff1aSopenharmony_ci} 230cabdff1aSopenharmony_ci 231cabdff1aSopenharmony_civoid ff_vc1_inv_trans_8x4_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block) 232cabdff1aSopenharmony_ci{ 233cabdff1aSopenharmony_ci v4i32 in0, in1, in2, in3, in4, in5, in6, in7; 234cabdff1aSopenharmony_ci v4i32 t1, t2, t3, t4, t5, t6, t7, t8; 235cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 236cabdff1aSopenharmony_ci v16i8 zero_m = { 0 }; 237cabdff1aSopenharmony_ci v4i32 cnst_17 = {17, 17, 17, 17}; 238cabdff1aSopenharmony_ci v4i32 cnst_22 = {22, 22, 22, 22}; 239cabdff1aSopenharmony_ci v4i32 cnst_10 = {10, 10, 10, 10}; 240cabdff1aSopenharmony_ci v4i32 cnst_12 = {12, 12, 12, 12}; 241cabdff1aSopenharmony_ci v4i32 cnst_64 = {64, 64, 64, 64}; 242cabdff1aSopenharmony_ci v4i32 cnst_16 = {16, 16, 16, 16}; 243cabdff1aSopenharmony_ci v4i32 cnst_15 = {15, 15, 15, 15}; 244cabdff1aSopenharmony_ci v4i32 cnst_4 = {4, 4, 4, 4}; 245cabdff1aSopenharmony_ci v4i32 cnst_6 = {6, 6, 6, 6}; 246cabdff1aSopenharmony_ci v4i32 cnst_9 = {9, 9, 9, 9}; 247cabdff1aSopenharmony_ci 248cabdff1aSopenharmony_ci LD_SW4(block, 8, t1, t2, t3, t4); 249cabdff1aSopenharmony_ci UNPCK_SH_SW(t1, in0, in4); 250cabdff1aSopenharmony_ci UNPCK_SH_SW(t2, in1, in5); 251cabdff1aSopenharmony_ci UNPCK_SH_SW(t3, in2, in6); 252cabdff1aSopenharmony_ci UNPCK_SH_SW(t4, in3, in7); 253cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, in0, in1, in2, in3); 254cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in4, in5, in6, in7, in4, in5, in6, in7); 255cabdff1aSopenharmony_ci // First loop 256cabdff1aSopenharmony_ci t1 = cnst_12 * (in0 + in4) + cnst_4; 257cabdff1aSopenharmony_ci t2 = cnst_12 * (in0 - in4) + cnst_4; 258cabdff1aSopenharmony_ci t3 = cnst_16 * in2 + cnst_6 * in6; 259cabdff1aSopenharmony_ci t4 = cnst_6 * in2 - cnst_16 * in6; 260cabdff1aSopenharmony_ci t5 = t1 + t3, t6 = t2 + t4; 261cabdff1aSopenharmony_ci t7 = t2 - t4, t8 = t1 - t3; 262cabdff1aSopenharmony_ci t1 = cnst_16 * in1 + cnst_15 * in3 + cnst_9 * in5 + cnst_4 * in7; 263cabdff1aSopenharmony_ci t2 = cnst_15 * in1 - cnst_4 * in3 - cnst_16 * in5 - cnst_9 * in7; 264cabdff1aSopenharmony_ci t3 = cnst_9 * in1 - cnst_16 * in3 + cnst_4 * in5 + cnst_15 * in7; 265cabdff1aSopenharmony_ci t4 = cnst_4 * in1 - cnst_9 * in3 + cnst_15 * in5 - cnst_16 * in7; 266cabdff1aSopenharmony_ci in0 = (t5 + t1) >> 3; 267cabdff1aSopenharmony_ci in1 = (t6 + t2) >> 3; 268cabdff1aSopenharmony_ci in2 = (t7 + t3) >> 3; 269cabdff1aSopenharmony_ci in3 = (t8 + t4) >> 3; 270cabdff1aSopenharmony_ci in4 = (t8 - t4) >> 3; 271cabdff1aSopenharmony_ci in5 = (t7 - t3) >> 3; 272cabdff1aSopenharmony_ci in6 = (t6 - t2) >> 3; 273cabdff1aSopenharmony_ci in7 = (t5 - t1) >> 3; 274cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, in0, in1, in2, in3); 275cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(in4, in5, in6, in7, in4, in5, in6, in7); 276cabdff1aSopenharmony_ci PCKEV_H4_SW(in4, in0, in5, in1, in6, in2, in7, in3, t1, t2, t3, t4); 277cabdff1aSopenharmony_ci ST_SW4(t1, t2, t3, t4, block, 8); 278cabdff1aSopenharmony_ci // Second loop 279cabdff1aSopenharmony_ci LD_SW4(dest, linesize, dst0, dst1, dst2, dst3); 280cabdff1aSopenharmony_ci ILVR_B4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3, 281cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 282cabdff1aSopenharmony_ci ILVL_H4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3, 283cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 284cabdff1aSopenharmony_ci ILVR_H4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3, 285cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 286cabdff1aSopenharmony_ci // Right part 287cabdff1aSopenharmony_ci t1 = cnst_17 * (in0 + in2) + cnst_64; 288cabdff1aSopenharmony_ci t2 = cnst_17 * (in0 - in2) + cnst_64; 289cabdff1aSopenharmony_ci t3 = cnst_22 * in1 + cnst_10 * in3; 290cabdff1aSopenharmony_ci t4 = cnst_22 * in3 - cnst_10 * in1; 291cabdff1aSopenharmony_ci in0 = (t1 + t3) >> 7; 292cabdff1aSopenharmony_ci in1 = (t2 - t4) >> 7; 293cabdff1aSopenharmony_ci in2 = (t2 + t4) >> 7; 294cabdff1aSopenharmony_ci in3 = (t1 - t3) >> 7; 295cabdff1aSopenharmony_ci ADD4(in0, dst0, in1, dst1, in2, dst2, in3, dst3, in0, in1, in2, in3); 296cabdff1aSopenharmony_ci CLIP_SW4_0_255(in0, in1, in2, in3); 297cabdff1aSopenharmony_ci // Left part 298cabdff1aSopenharmony_ci t5 = cnst_17 * (in4 + in6) + cnst_64; 299cabdff1aSopenharmony_ci t6 = cnst_17 * (in4 - in6) + cnst_64; 300cabdff1aSopenharmony_ci t7 = cnst_22 * in5 + cnst_10 * in7; 301cabdff1aSopenharmony_ci t8 = cnst_22 * in7 - cnst_10 * in5; 302cabdff1aSopenharmony_ci in4 = (t5 + t7) >> 7; 303cabdff1aSopenharmony_ci in5 = (t6 - t8) >> 7; 304cabdff1aSopenharmony_ci in6 = (t6 + t8) >> 7; 305cabdff1aSopenharmony_ci in7 = (t5 - t7) >> 7; 306cabdff1aSopenharmony_ci ADD4(in4, dst4, in5, dst5, in6, dst6, in7, dst7, in4, in5, in6, in7); 307cabdff1aSopenharmony_ci CLIP_SW4_0_255(in4, in5, in6, in7); 308cabdff1aSopenharmony_ci PCKEV_H4_SW(in4, in0, in5, in1, in6, in2, in7, in3, in0, in1, in2, in3); 309cabdff1aSopenharmony_ci PCKEV_B2_SW(in1, in0, in3, in2, in0, in1); 310cabdff1aSopenharmony_ci ST_D4(in0, in1, 0, 1, 0, 1, dest, linesize); 311cabdff1aSopenharmony_ci} 312cabdff1aSopenharmony_ci 313cabdff1aSopenharmony_cistatic void put_vc1_mspel_mc_h_v_msa(uint8_t *dst, const uint8_t *src, 314cabdff1aSopenharmony_ci ptrdiff_t stride, int hmode, int vmode, 315cabdff1aSopenharmony_ci int rnd) 316cabdff1aSopenharmony_ci{ 317cabdff1aSopenharmony_ci v8i16 in_r0, in_r1, in_r2, in_r3, in_l0, in_l1, in_l2, in_l3; 318cabdff1aSopenharmony_ci v8i16 t0, t1, t2, t3, t4, t5, t6, t7; 319cabdff1aSopenharmony_ci v8i16 t8, t9, t10, t11, t12, t13, t14, t15; 320cabdff1aSopenharmony_ci v8i16 cnst_para0, cnst_para1, cnst_para2, cnst_para3, cnst_r; 321cabdff1aSopenharmony_ci static const int para_value[][4] = {{4, 53, 18, 3}, 322cabdff1aSopenharmony_ci {1, 9, 9, 1}, 323cabdff1aSopenharmony_ci {3, 18, 53, 4}}; 324cabdff1aSopenharmony_ci static const int shift_value[] = {0, 5, 1, 5}; 325cabdff1aSopenharmony_ci int shift = (shift_value[hmode] + shift_value[vmode]) >> 1; 326cabdff1aSopenharmony_ci int r = (1 << (shift - 1)) + rnd - 1; 327cabdff1aSopenharmony_ci cnst_r = __msa_fill_h(r); 328cabdff1aSopenharmony_ci src -= 1, src -= stride; 329cabdff1aSopenharmony_ci cnst_para0 = __msa_fill_h(para_value[vmode - 1][0]); 330cabdff1aSopenharmony_ci cnst_para1 = __msa_fill_h(para_value[vmode - 1][1]); 331cabdff1aSopenharmony_ci cnst_para2 = __msa_fill_h(para_value[vmode - 1][2]); 332cabdff1aSopenharmony_ci cnst_para3 = __msa_fill_h(para_value[vmode - 1][3]); 333cabdff1aSopenharmony_ci LD_SH4(src, stride, in_l0, in_l1, in_l2, in_l3); 334cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l0, in_r0, in_l0); 335cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l1, in_r1, in_l1); 336cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l2, in_r2, in_l2); 337cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l3, in_r3, in_l3); 338cabdff1aSopenharmony_ci // row 0 339cabdff1aSopenharmony_ci t0 = cnst_para1 * in_r1 + cnst_para2 * in_r2 340cabdff1aSopenharmony_ci - cnst_para0 * in_r0 - cnst_para3 * in_r3; 341cabdff1aSopenharmony_ci t8 = cnst_para1 * in_l1 + cnst_para2 * in_l2 342cabdff1aSopenharmony_ci - cnst_para0 * in_l0 - cnst_para3 * in_l3; 343cabdff1aSopenharmony_ci in_l0 = LD_SH(src + 4 * stride); 344cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l0, in_r0, in_l0); 345cabdff1aSopenharmony_ci // row 1 346cabdff1aSopenharmony_ci t1 = cnst_para1 * in_r2 + cnst_para2 * in_r3 347cabdff1aSopenharmony_ci - cnst_para0 * in_r1 - cnst_para3 * in_r0; 348cabdff1aSopenharmony_ci t9 = cnst_para1 * in_l2 + cnst_para2 * in_l3 349cabdff1aSopenharmony_ci - cnst_para0 * in_l1 - cnst_para3 * in_l0; 350cabdff1aSopenharmony_ci in_l1 = LD_SH(src + 5 * stride); 351cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l1, in_r1, in_l1); 352cabdff1aSopenharmony_ci // row 2 353cabdff1aSopenharmony_ci t2 = cnst_para1 * in_r3 + cnst_para2 * in_r0 354cabdff1aSopenharmony_ci - cnst_para0 * in_r2 - cnst_para3 * in_r1; 355cabdff1aSopenharmony_ci t10 = cnst_para1 * in_l3 + cnst_para2 * in_l0 356cabdff1aSopenharmony_ci - cnst_para0 * in_l2 - cnst_para3 * in_l1; 357cabdff1aSopenharmony_ci in_l2 = LD_SH(src + 6 * stride); 358cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l2, in_r2, in_l2); 359cabdff1aSopenharmony_ci // row 3 360cabdff1aSopenharmony_ci t3 = cnst_para1 * in_r0 + cnst_para2 * in_r1 361cabdff1aSopenharmony_ci - cnst_para0 * in_r3 - cnst_para3 * in_r2; 362cabdff1aSopenharmony_ci t11 = cnst_para1 * in_l0 + cnst_para2 * in_l1 363cabdff1aSopenharmony_ci - cnst_para0 * in_l3 - cnst_para3 * in_l2; 364cabdff1aSopenharmony_ci in_l3 = LD_SH(src + 7 * stride); 365cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l3, in_r3, in_l3); 366cabdff1aSopenharmony_ci // row 4 367cabdff1aSopenharmony_ci t4 = cnst_para1 * in_r1 + cnst_para2 * in_r2 368cabdff1aSopenharmony_ci - cnst_para0 * in_r0 - cnst_para3 * in_r3; 369cabdff1aSopenharmony_ci t12 = cnst_para1 * in_l1 + cnst_para2 * in_l2 370cabdff1aSopenharmony_ci - cnst_para0 * in_l0 - cnst_para3 * in_l3; 371cabdff1aSopenharmony_ci in_l0 = LD_SH(src + 8 * stride); 372cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l0, in_r0, in_l0); 373cabdff1aSopenharmony_ci // row 5 374cabdff1aSopenharmony_ci t5 = cnst_para1 * in_r2 + cnst_para2 * in_r3 375cabdff1aSopenharmony_ci - cnst_para0 * in_r1 - cnst_para3 * in_r0; 376cabdff1aSopenharmony_ci t13 = cnst_para1 * in_l2 + cnst_para2 * in_l3 377cabdff1aSopenharmony_ci - cnst_para0 * in_l1 - cnst_para3 * in_l0; 378cabdff1aSopenharmony_ci in_l1 = LD_SH(src + 9 * stride); 379cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l1, in_r1, in_l1); 380cabdff1aSopenharmony_ci // row 6 381cabdff1aSopenharmony_ci t6 = cnst_para1 * in_r3 + cnst_para2 * in_r0 382cabdff1aSopenharmony_ci - cnst_para0 * in_r2 - cnst_para3 * in_r1; 383cabdff1aSopenharmony_ci t14 = cnst_para1 * in_l3 + cnst_para2 * in_l0 384cabdff1aSopenharmony_ci - cnst_para0 * in_l2 - cnst_para3 * in_l1; 385cabdff1aSopenharmony_ci in_l2 = LD_SH(src + 10 * stride); 386cabdff1aSopenharmony_ci UNPCK_UB_SH(in_l2, in_r2, in_l2); 387cabdff1aSopenharmony_ci // row 7 388cabdff1aSopenharmony_ci t7 = cnst_para1 * in_r0 + cnst_para2 * in_r1 389cabdff1aSopenharmony_ci - cnst_para0 * in_r3 - cnst_para3 * in_r2; 390cabdff1aSopenharmony_ci t15 = cnst_para1 * in_l0 + cnst_para2 * in_l1 391cabdff1aSopenharmony_ci - cnst_para0 * in_l3 - cnst_para3 * in_l2; 392cabdff1aSopenharmony_ci 393cabdff1aSopenharmony_ci ADD4(t0, cnst_r, t1, cnst_r, t2, cnst_r, t3, cnst_r, t0, t1, t2, t3); 394cabdff1aSopenharmony_ci ADD4(t4, cnst_r, t5, cnst_r, t6, cnst_r, t7, cnst_r, t4, t5, t6, t7); 395cabdff1aSopenharmony_ci ADD4(t8, cnst_r, t9, cnst_r, t10, cnst_r, t11, cnst_r, 396cabdff1aSopenharmony_ci t8, t9, t10, t11); 397cabdff1aSopenharmony_ci ADD4(t12, cnst_r, t13, cnst_r, t14, cnst_r, t15, cnst_r, 398cabdff1aSopenharmony_ci t12, t13, t14, t15); 399cabdff1aSopenharmony_ci t0 >>= shift, t1 >>= shift, t2 >>= shift, t3 >>= shift; 400cabdff1aSopenharmony_ci t4 >>= shift, t5 >>= shift, t6 >>= shift, t7 >>= shift; 401cabdff1aSopenharmony_ci t8 >>= shift, t9 >>= shift, t10 >>= shift, t11 >>= shift; 402cabdff1aSopenharmony_ci t12 >>= shift, t13 >>= shift, t14 >>= shift, t15 >>= shift; 403cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(t0, t1, t2, t3, t4, t5, t6, t7, 404cabdff1aSopenharmony_ci t0, t1, t2, t3, t4, t5, t6, t7); 405cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(t8, t9, t10, t11, t12, t13, t14, t15, 406cabdff1aSopenharmony_ci t8, t9, t10, t11, t12, t13, t14, t15); 407cabdff1aSopenharmony_ci cnst_para0 = __msa_fill_h(para_value[hmode - 1][0]); 408cabdff1aSopenharmony_ci cnst_para1 = __msa_fill_h(para_value[hmode - 1][1]); 409cabdff1aSopenharmony_ci cnst_para2 = __msa_fill_h(para_value[hmode - 1][2]); 410cabdff1aSopenharmony_ci cnst_para3 = __msa_fill_h(para_value[hmode - 1][3]); 411cabdff1aSopenharmony_ci r = 64 - rnd; 412cabdff1aSopenharmony_ci cnst_r = __msa_fill_h(r); 413cabdff1aSopenharmony_ci // col 0 ~ 7 414cabdff1aSopenharmony_ci t0 = cnst_para1 * t1 + cnst_para2 * t2 - cnst_para0 * t0 - cnst_para3 * t3; 415cabdff1aSopenharmony_ci t1 = cnst_para1 * t2 + cnst_para2 * t3 - cnst_para0 * t1 - cnst_para3 * t4; 416cabdff1aSopenharmony_ci t2 = cnst_para1 * t3 + cnst_para2 * t4 - cnst_para0 * t2 - cnst_para3 * t5; 417cabdff1aSopenharmony_ci t3 = cnst_para1 * t4 + cnst_para2 * t5 - cnst_para0 * t3 - cnst_para3 * t6; 418cabdff1aSopenharmony_ci t4 = cnst_para1 * t5 + cnst_para2 * t6 - cnst_para0 * t4 - cnst_para3 * t7; 419cabdff1aSopenharmony_ci t5 = cnst_para1 * t6 + cnst_para2 * t7 - cnst_para0 * t5 - cnst_para3 * t8; 420cabdff1aSopenharmony_ci t6 = cnst_para1 * t7 + cnst_para2 * t8 - cnst_para0 * t6 - cnst_para3 * t9; 421cabdff1aSopenharmony_ci t7 = cnst_para1 * t8 + cnst_para2 * t9 - cnst_para0 * t7 - cnst_para3 * t10; 422cabdff1aSopenharmony_ci ADD4(t0, cnst_r, t1, cnst_r, t2, cnst_r, t3, cnst_r, t0, t1, t2, t3); 423cabdff1aSopenharmony_ci ADD4(t4, cnst_r, t5, cnst_r, t6, cnst_r, t7, cnst_r, t4, t5, t6, t7); 424cabdff1aSopenharmony_ci t0 >>= 7, t1 >>= 7, t2 >>= 7, t3 >>= 7; 425cabdff1aSopenharmony_ci t4 >>= 7, t5 >>= 7, t6 >>= 7, t7 >>= 7; 426cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(t0, t1, t2, t3, t4, t5, t6, t7, 427cabdff1aSopenharmony_ci t0, t1, t2, t3, t4, t5, t6, t7); 428cabdff1aSopenharmony_ci CLIP_SH8_0_255(t0, t1, t2, t3, t4, t5, t6, t7); 429cabdff1aSopenharmony_ci PCKEV_B4_SH(t1, t0, t3, t2, t5, t4, t7, t6, t0, t1, t2, t3); 430cabdff1aSopenharmony_ci ST_D8(t0, t1, t2, t3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 431cabdff1aSopenharmony_ci} 432cabdff1aSopenharmony_ci 433cabdff1aSopenharmony_ci#define PUT_VC1_MSPEL_MC_MSA(hmode, vmode) \ 434cabdff1aSopenharmony_civoid ff_put_vc1_mspel_mc ## hmode ## vmode ## _msa(uint8_t *dst, \ 435cabdff1aSopenharmony_ci const uint8_t *src, \ 436cabdff1aSopenharmony_ci ptrdiff_t stride, int rnd) \ 437cabdff1aSopenharmony_ci{ \ 438cabdff1aSopenharmony_ci put_vc1_mspel_mc_h_v_msa(dst, src, stride, hmode, vmode, rnd); \ 439cabdff1aSopenharmony_ci} \ 440cabdff1aSopenharmony_civoid ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_msa(uint8_t *dst, \ 441cabdff1aSopenharmony_ci const uint8_t *src, \ 442cabdff1aSopenharmony_ci ptrdiff_t stride, int rnd) \ 443cabdff1aSopenharmony_ci{ \ 444cabdff1aSopenharmony_ci put_vc1_mspel_mc_h_v_msa(dst, src, stride, hmode, vmode, rnd); \ 445cabdff1aSopenharmony_ci put_vc1_mspel_mc_h_v_msa(dst + 8, src + 8, stride, hmode, vmode, rnd); \ 446cabdff1aSopenharmony_ci dst += 8 * stride, src += 8 * stride; \ 447cabdff1aSopenharmony_ci put_vc1_mspel_mc_h_v_msa(dst, src, stride, hmode, vmode, rnd); \ 448cabdff1aSopenharmony_ci put_vc1_mspel_mc_h_v_msa(dst + 8, src + 8, stride, hmode, vmode, rnd); \ 449cabdff1aSopenharmony_ci} 450cabdff1aSopenharmony_ci 451cabdff1aSopenharmony_ciPUT_VC1_MSPEL_MC_MSA(1, 1); 452cabdff1aSopenharmony_ciPUT_VC1_MSPEL_MC_MSA(1, 2); 453cabdff1aSopenharmony_ciPUT_VC1_MSPEL_MC_MSA(1, 3); 454cabdff1aSopenharmony_ci 455cabdff1aSopenharmony_ciPUT_VC1_MSPEL_MC_MSA(2, 1); 456cabdff1aSopenharmony_ciPUT_VC1_MSPEL_MC_MSA(2, 2); 457cabdff1aSopenharmony_ciPUT_VC1_MSPEL_MC_MSA(2, 3); 458cabdff1aSopenharmony_ci 459cabdff1aSopenharmony_ciPUT_VC1_MSPEL_MC_MSA(3, 1); 460cabdff1aSopenharmony_ciPUT_VC1_MSPEL_MC_MSA(3, 2); 461cabdff1aSopenharmony_ciPUT_VC1_MSPEL_MC_MSA(3, 3); 462