1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 22cabdff1aSopenharmony_ci#include "libavcodec/mips/hevcdsp_mips.h" 23cabdff1aSopenharmony_ci#include "libavcodec/mips/hevc_macros_msa.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_cistatic const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { 26cabdff1aSopenharmony_ci /* 8 width cases */ 27cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 28cabdff1aSopenharmony_ci /* 4 width cases */ 29cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 30cabdff1aSopenharmony_ci}; 31cabdff1aSopenharmony_ci 
32cabdff1aSopenharmony_cistatic void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride, 33cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 34cabdff1aSopenharmony_ci int32_t height) 35cabdff1aSopenharmony_ci{ 36cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 37cabdff1aSopenharmony_ci 38cabdff1aSopenharmony_ci if (2 == height) { 39cabdff1aSopenharmony_ci v16i8 src0, src1; 40cabdff1aSopenharmony_ci v8i16 in0; 41cabdff1aSopenharmony_ci 42cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src1); 43cabdff1aSopenharmony_ci 44cabdff1aSopenharmony_ci src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0); 45cabdff1aSopenharmony_ci in0 = (v8i16) __msa_ilvr_b(zero, src0); 46cabdff1aSopenharmony_ci in0 <<= 6; 47cabdff1aSopenharmony_ci ST_D2(in0, 0, 1, dst, dst_stride); 48cabdff1aSopenharmony_ci } else if (4 == height) { 49cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 50cabdff1aSopenharmony_ci v8i16 in0, in1; 51cabdff1aSopenharmony_ci 52cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 53cabdff1aSopenharmony_ci 54cabdff1aSopenharmony_ci ILVR_W2_SB(src1, src0, src3, src2, src0, src1); 55cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src0, zero, src1, in0, in1); 56cabdff1aSopenharmony_ci in0 <<= 6; 57cabdff1aSopenharmony_ci in1 <<= 6; 58cabdff1aSopenharmony_ci ST_D4(in0, in1, 0, 1, 0, 1, dst, dst_stride); 59cabdff1aSopenharmony_ci } else if (0 == height % 8) { 60cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 61cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 62cabdff1aSopenharmony_ci uint32_t loop_cnt; 63cabdff1aSopenharmony_ci 64cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 65cabdff1aSopenharmony_ci LD_SB8(src, src_stride, 66cabdff1aSopenharmony_ci src0, src1, src2, src3, src4, src5, src6, src7); 67cabdff1aSopenharmony_ci src += (8 * src_stride); 68cabdff1aSopenharmony_ci 69cabdff1aSopenharmony_ci ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 70cabdff1aSopenharmony_ci src0, 
src1, src2, src3); 71cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 72cabdff1aSopenharmony_ci in0, in1, in2, in3); 73cabdff1aSopenharmony_ci SLLI_4V(in0, in1, in2, in3, 6); 74cabdff1aSopenharmony_ci ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 75cabdff1aSopenharmony_ci dst += (8 * dst_stride); 76cabdff1aSopenharmony_ci } 77cabdff1aSopenharmony_ci } 78cabdff1aSopenharmony_ci} 79cabdff1aSopenharmony_ci 80cabdff1aSopenharmony_cistatic void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride, 81cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 82cabdff1aSopenharmony_ci int32_t height) 83cabdff1aSopenharmony_ci{ 84cabdff1aSopenharmony_ci uint32_t loop_cnt; 85cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 86cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 87cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 88cabdff1aSopenharmony_ci 89cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 90cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 91cabdff1aSopenharmony_ci src += (8 * src_stride); 92cabdff1aSopenharmony_ci 93cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 94cabdff1aSopenharmony_ci in0, in1, in2, in3); 95cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 96cabdff1aSopenharmony_ci in4, in5, in6, in7); 97cabdff1aSopenharmony_ci SLLI_4V(in0, in1, in2, in3, 6); 98cabdff1aSopenharmony_ci SLLI_4V(in4, in5, in6, in7, 6); 99cabdff1aSopenharmony_ci ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride); 100cabdff1aSopenharmony_ci dst += (8 * dst_stride); 101cabdff1aSopenharmony_ci } 102cabdff1aSopenharmony_ci} 103cabdff1aSopenharmony_ci 104cabdff1aSopenharmony_cistatic void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride, 105cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 106cabdff1aSopenharmony_ci int32_t 
height) 107cabdff1aSopenharmony_ci{ 108cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 109cabdff1aSopenharmony_ci 110cabdff1aSopenharmony_ci if (2 == height) { 111cabdff1aSopenharmony_ci v16i8 src0, src1; 112cabdff1aSopenharmony_ci v8i16 in0, in1; 113cabdff1aSopenharmony_ci 114cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src1); 115cabdff1aSopenharmony_ci 116cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src0, zero, src1, in0, in1); 117cabdff1aSopenharmony_ci in0 <<= 6; 118cabdff1aSopenharmony_ci in1 <<= 6; 119cabdff1aSopenharmony_ci ST_SH2(in0, in1, dst, dst_stride); 120cabdff1aSopenharmony_ci } else if (4 == height) { 121cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 122cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 123cabdff1aSopenharmony_ci 124cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 125cabdff1aSopenharmony_ci 126cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 127cabdff1aSopenharmony_ci in0, in1, in2, in3); 128cabdff1aSopenharmony_ci SLLI_4V(in0, in1, in2, in3, 6); 129cabdff1aSopenharmony_ci ST_SH4(in0, in1, in2, in3, dst, dst_stride); 130cabdff1aSopenharmony_ci } else if (6 == height) { 131cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 132cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 133cabdff1aSopenharmony_ci 134cabdff1aSopenharmony_ci LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5); 135cabdff1aSopenharmony_ci 136cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 137cabdff1aSopenharmony_ci in0, in1, in2, in3); 138cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src4, zero, src5, in4, in5); 139cabdff1aSopenharmony_ci SLLI_4V(in0, in1, in2, in3, 6); 140cabdff1aSopenharmony_ci in4 <<= 6; 141cabdff1aSopenharmony_ci in5 <<= 6; 142cabdff1aSopenharmony_ci ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride); 143cabdff1aSopenharmony_ci } else if (0 == height % 8) { 144cabdff1aSopenharmony_ci uint32_t loop_cnt; 
145cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 146cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 147cabdff1aSopenharmony_ci 148cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 149cabdff1aSopenharmony_ci LD_SB8(src, src_stride, 150cabdff1aSopenharmony_ci src0, src1, src2, src3, src4, src5, src6, src7); 151cabdff1aSopenharmony_ci src += (8 * src_stride); 152cabdff1aSopenharmony_ci 153cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 154cabdff1aSopenharmony_ci in0, in1, in2, in3); 155cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 156cabdff1aSopenharmony_ci in4, in5, in6, in7); 157cabdff1aSopenharmony_ci SLLI_4V(in0, in1, in2, in3, 6); 158cabdff1aSopenharmony_ci SLLI_4V(in4, in5, in6, in7, 6); 159cabdff1aSopenharmony_ci ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride); 160cabdff1aSopenharmony_ci dst += (8 * dst_stride); 161cabdff1aSopenharmony_ci } 162cabdff1aSopenharmony_ci } 163cabdff1aSopenharmony_ci} 164cabdff1aSopenharmony_ci 165cabdff1aSopenharmony_cistatic void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride, 166cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 167cabdff1aSopenharmony_ci int32_t height) 168cabdff1aSopenharmony_ci{ 169cabdff1aSopenharmony_ci uint32_t loop_cnt; 170cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 171cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 172cabdff1aSopenharmony_ci v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r; 173cabdff1aSopenharmony_ci 174cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 175cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 176cabdff1aSopenharmony_ci src += (8 * src_stride); 177cabdff1aSopenharmony_ci 178cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 179cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 
180cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 181cabdff1aSopenharmony_ci ILVL_W2_SB(src1, src0, src3, src2, src0, src1); 182cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src0, zero, src1, in0, in1); 183cabdff1aSopenharmony_ci in0 <<= 6; 184cabdff1aSopenharmony_ci in1 <<= 6; 185cabdff1aSopenharmony_ci ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 186cabdff1aSopenharmony_ci ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride); 187cabdff1aSopenharmony_ci dst += (4 * dst_stride); 188cabdff1aSopenharmony_ci 189cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 190cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 191cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 192cabdff1aSopenharmony_ci ILVL_W2_SB(src5, src4, src7, src6, src0, src1); 193cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src0, zero, src1, in0, in1); 194cabdff1aSopenharmony_ci in0 <<= 6; 195cabdff1aSopenharmony_ci in1 <<= 6; 196cabdff1aSopenharmony_ci ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 197cabdff1aSopenharmony_ci ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride); 198cabdff1aSopenharmony_ci dst += (4 * dst_stride); 199cabdff1aSopenharmony_ci } 200cabdff1aSopenharmony_ci} 201cabdff1aSopenharmony_ci 202cabdff1aSopenharmony_cistatic void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride, 203cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 204cabdff1aSopenharmony_ci int32_t height) 205cabdff1aSopenharmony_ci{ 206cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 207cabdff1aSopenharmony_ci 208cabdff1aSopenharmony_ci if (4 == height) { 209cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 210cabdff1aSopenharmony_ci v8i16 in0_r, in1_r, in2_r, in3_r; 211cabdff1aSopenharmony_ci v8i16 in0_l, in1_l, in2_l, in3_l; 212cabdff1aSopenharmony_ci 213cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 214cabdff1aSopenharmony_ci 215cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 
216cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 217cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 218cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 219cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 220cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 221cabdff1aSopenharmony_ci ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 222cabdff1aSopenharmony_ci ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 223cabdff1aSopenharmony_ci } else if (12 == height) { 224cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 225cabdff1aSopenharmony_ci v16i8 src8, src9, src10, src11; 226cabdff1aSopenharmony_ci v8i16 in0_r, in1_r, in2_r, in3_r; 227cabdff1aSopenharmony_ci v8i16 in0_l, in1_l, in2_l, in3_l; 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 230cabdff1aSopenharmony_ci src += (8 * src_stride); 231cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src8, src9, src10, src11); 232cabdff1aSopenharmony_ci 233cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 234cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 235cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 236cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 237cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 238cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 239cabdff1aSopenharmony_ci ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 240cabdff1aSopenharmony_ci ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 241cabdff1aSopenharmony_ci dst += (4 * dst_stride); 242cabdff1aSopenharmony_ci 243cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 244cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 245cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 
246cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 247cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 248cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 249cabdff1aSopenharmony_ci ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 250cabdff1aSopenharmony_ci ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 251cabdff1aSopenharmony_ci dst += (4 * dst_stride); 252cabdff1aSopenharmony_ci 253cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11, 254cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 255cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11, 256cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 257cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 258cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 259cabdff1aSopenharmony_ci ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 260cabdff1aSopenharmony_ci ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 261cabdff1aSopenharmony_ci } else if (0 == (height % 8)) { 262cabdff1aSopenharmony_ci uint32_t loop_cnt; 263cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 264cabdff1aSopenharmony_ci v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 265cabdff1aSopenharmony_ci 266cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 267cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, 268cabdff1aSopenharmony_ci src7); 269cabdff1aSopenharmony_ci src += (8 * src_stride); 270cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, 271cabdff1aSopenharmony_ci in1_r, in2_r, in3_r); 272cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, 273cabdff1aSopenharmony_ci in1_l, in2_l, in3_l); 274cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 275cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 
276cabdff1aSopenharmony_ci ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 277cabdff1aSopenharmony_ci ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 278cabdff1aSopenharmony_ci dst += (4 * dst_stride); 279cabdff1aSopenharmony_ci 280cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, 281cabdff1aSopenharmony_ci in1_r, in2_r, in3_r); 282cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, 283cabdff1aSopenharmony_ci in1_l, in2_l, in3_l); 284cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 285cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 286cabdff1aSopenharmony_ci ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 287cabdff1aSopenharmony_ci ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 288cabdff1aSopenharmony_ci dst += (4 * dst_stride); 289cabdff1aSopenharmony_ci } 290cabdff1aSopenharmony_ci } 291cabdff1aSopenharmony_ci} 292cabdff1aSopenharmony_ci 293cabdff1aSopenharmony_cistatic void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride, 294cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 295cabdff1aSopenharmony_ci int32_t height) 296cabdff1aSopenharmony_ci{ 297cabdff1aSopenharmony_ci uint32_t loop_cnt; 298cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 299cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 300cabdff1aSopenharmony_ci v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 301cabdff1aSopenharmony_ci 302cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 303cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 304cabdff1aSopenharmony_ci LD_SB4((src + 16), src_stride, src4, src5, src6, src7); 305cabdff1aSopenharmony_ci src += (4 * src_stride); 306cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r, 307cabdff1aSopenharmony_ci in2_r, in3_r); 308cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src0, zero, src1, zero, 
src2, zero, src3, in0_l, in1_l, 309cabdff1aSopenharmony_ci in2_l, in3_l); 310cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 311cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 312cabdff1aSopenharmony_ci ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 313cabdff1aSopenharmony_ci ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 314cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r, 315cabdff1aSopenharmony_ci in2_r, in3_r); 316cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 317cabdff1aSopenharmony_ci ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride); 318cabdff1aSopenharmony_ci dst += (4 * dst_stride); 319cabdff1aSopenharmony_ci } 320cabdff1aSopenharmony_ci} 321cabdff1aSopenharmony_ci 322cabdff1aSopenharmony_cistatic void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride, 323cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 324cabdff1aSopenharmony_ci int32_t height) 325cabdff1aSopenharmony_ci{ 326cabdff1aSopenharmony_ci uint32_t loop_cnt; 327cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 328cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 329cabdff1aSopenharmony_ci v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 330cabdff1aSopenharmony_ci 331cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 332cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src2, src4, src6); 333cabdff1aSopenharmony_ci LD_SB4((src + 16), src_stride, src1, src3, src5, src7); 334cabdff1aSopenharmony_ci src += (4 * src_stride); 335cabdff1aSopenharmony_ci 336cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r, 337cabdff1aSopenharmony_ci in2_r, in3_r); 338cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l, 339cabdff1aSopenharmony_ci in2_l, in3_l); 340cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 
341cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 342cabdff1aSopenharmony_ci ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8); 343cabdff1aSopenharmony_ci dst += dst_stride; 344cabdff1aSopenharmony_ci ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8); 345cabdff1aSopenharmony_ci dst += dst_stride; 346cabdff1aSopenharmony_ci 347cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r, 348cabdff1aSopenharmony_ci in2_r, in3_r); 349cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l, 350cabdff1aSopenharmony_ci in2_l, in3_l); 351cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 352cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 353cabdff1aSopenharmony_ci ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8); 354cabdff1aSopenharmony_ci dst += dst_stride; 355cabdff1aSopenharmony_ci ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8); 356cabdff1aSopenharmony_ci dst += dst_stride; 357cabdff1aSopenharmony_ci } 358cabdff1aSopenharmony_ci} 359cabdff1aSopenharmony_ci 360cabdff1aSopenharmony_cistatic void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride, 361cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 362cabdff1aSopenharmony_ci int32_t height) 363cabdff1aSopenharmony_ci{ 364cabdff1aSopenharmony_ci uint32_t loop_cnt; 365cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 366cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 367cabdff1aSopenharmony_ci v16i8 src8, src9, src10, src11; 368cabdff1aSopenharmony_ci v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r; 369cabdff1aSopenharmony_ci v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l; 370cabdff1aSopenharmony_ci 371cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 372cabdff1aSopenharmony_ci LD_SB3(src, 16, src0, src1, src2); 373cabdff1aSopenharmony_ci src += src_stride; 374cabdff1aSopenharmony_ci LD_SB3(src, 16, src3, src4, src5); 375cabdff1aSopenharmony_ci src += src_stride; 
376cabdff1aSopenharmony_ci LD_SB3(src, 16, src6, src7, src8); 377cabdff1aSopenharmony_ci src += src_stride; 378cabdff1aSopenharmony_ci LD_SB3(src, 16, src9, src10, src11); 379cabdff1aSopenharmony_ci src += src_stride; 380cabdff1aSopenharmony_ci 381cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 382cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 383cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 384cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 385cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src4, zero, src5, in4_r, in5_r); 386cabdff1aSopenharmony_ci ILVL_B2_SH(zero, src4, zero, src5, in4_l, in5_l); 387cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 388cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 389cabdff1aSopenharmony_ci SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6); 390cabdff1aSopenharmony_ci ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8); 391cabdff1aSopenharmony_ci dst += dst_stride; 392cabdff1aSopenharmony_ci ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8); 393cabdff1aSopenharmony_ci dst += dst_stride; 394cabdff1aSopenharmony_ci 395cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9, 396cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 397cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9, 398cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 399cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src10, zero, src11, in4_r, in5_r); 400cabdff1aSopenharmony_ci ILVL_B2_SH(zero, src10, zero, src11, in4_l, in5_l); 401cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 402cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 403cabdff1aSopenharmony_ci SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6); 404cabdff1aSopenharmony_ci ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8); 405cabdff1aSopenharmony_ci dst += dst_stride; 406cabdff1aSopenharmony_ci ST_SH6(in3_r, in3_l, 
in4_r, in4_l, in5_r, in5_l, dst, 8); 407cabdff1aSopenharmony_ci dst += dst_stride; 408cabdff1aSopenharmony_ci } 409cabdff1aSopenharmony_ci} 410cabdff1aSopenharmony_ci 411cabdff1aSopenharmony_cistatic void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride, 412cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 413cabdff1aSopenharmony_ci int32_t height) 414cabdff1aSopenharmony_ci{ 415cabdff1aSopenharmony_ci uint32_t loop_cnt; 416cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 417cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 418cabdff1aSopenharmony_ci v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 419cabdff1aSopenharmony_ci 420cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 421cabdff1aSopenharmony_ci LD_SB4(src, 16, src0, src1, src2, src3); 422cabdff1aSopenharmony_ci src += src_stride; 423cabdff1aSopenharmony_ci LD_SB4(src, 16, src4, src5, src6, src7); 424cabdff1aSopenharmony_ci src += src_stride; 425cabdff1aSopenharmony_ci 426cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 427cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 428cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 429cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 430cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 431cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 432cabdff1aSopenharmony_ci ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8); 433cabdff1aSopenharmony_ci ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8); 434cabdff1aSopenharmony_ci dst += dst_stride; 435cabdff1aSopenharmony_ci 436cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 437cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 438cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 439cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 440cabdff1aSopenharmony_ci SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 
441cabdff1aSopenharmony_ci SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 442cabdff1aSopenharmony_ci ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8); 443cabdff1aSopenharmony_ci ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8); 444cabdff1aSopenharmony_ci dst += dst_stride; 445cabdff1aSopenharmony_ci } 446cabdff1aSopenharmony_ci} 447cabdff1aSopenharmony_ci 448cabdff1aSopenharmony_cistatic void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride, 449cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 450cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 451cabdff1aSopenharmony_ci{ 452cabdff1aSopenharmony_ci uint32_t loop_cnt; 453cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 454cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 455cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 456cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 457cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 458cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 459cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 460cabdff1aSopenharmony_ci 461cabdff1aSopenharmony_ci src -= 3; 462cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 463cabdff1aSopenharmony_ci const_vec <<= 6; 464cabdff1aSopenharmony_ci 465cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 466cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 467cabdff1aSopenharmony_ci 468cabdff1aSopenharmony_ci mask1 = mask0 + 2; 469cabdff1aSopenharmony_ci mask2 = mask0 + 4; 470cabdff1aSopenharmony_ci mask3 = mask0 + 6; 471cabdff1aSopenharmony_ci 472cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 473cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 474cabdff1aSopenharmony_ci src += (8 * src_stride); 475cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 476cabdff1aSopenharmony_ci 477cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, 
mask0, mask1, mask2, mask3, 478cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 479cabdff1aSopenharmony_ci dst0 = const_vec; 480cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 481cabdff1aSopenharmony_ci dst0, dst0, dst0, dst0); 482cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3, 483cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 484cabdff1aSopenharmony_ci dst1 = const_vec; 485cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 486cabdff1aSopenharmony_ci dst1, dst1, dst1, dst1); 487cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3, 488cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 489cabdff1aSopenharmony_ci dst2 = const_vec; 490cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 491cabdff1aSopenharmony_ci dst2, dst2, dst2, dst2); 492cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3, 493cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 494cabdff1aSopenharmony_ci dst3 = const_vec; 495cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 496cabdff1aSopenharmony_ci dst3, dst3, dst3, dst3); 497cabdff1aSopenharmony_ci 498cabdff1aSopenharmony_ci ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 499cabdff1aSopenharmony_ci dst += (8 * dst_stride); 500cabdff1aSopenharmony_ci } 501cabdff1aSopenharmony_ci} 502cabdff1aSopenharmony_ci 503cabdff1aSopenharmony_cistatic void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride, 504cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 505cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 506cabdff1aSopenharmony_ci{ 507cabdff1aSopenharmony_ci uint32_t loop_cnt; 508cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 509cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 510cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 511cabdff1aSopenharmony_ci v16i8 vec0, vec1, 
vec2, vec3; 512cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 513cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 514cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 515cabdff1aSopenharmony_ci 516cabdff1aSopenharmony_ci src -= 3; 517cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 518cabdff1aSopenharmony_ci const_vec <<= 6; 519cabdff1aSopenharmony_ci 520cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 521cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 522cabdff1aSopenharmony_ci 523cabdff1aSopenharmony_ci mask1 = mask0 + 2; 524cabdff1aSopenharmony_ci mask2 = mask0 + 4; 525cabdff1aSopenharmony_ci mask3 = mask0 + 6; 526cabdff1aSopenharmony_ci 527cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 528cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 529cabdff1aSopenharmony_ci src += (4 * src_stride); 530cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 531cabdff1aSopenharmony_ci 532cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 533cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 534cabdff1aSopenharmony_ci dst0 = const_vec; 535cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 536cabdff1aSopenharmony_ci dst0, dst0, dst0, dst0); 537cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 538cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 539cabdff1aSopenharmony_ci dst1 = const_vec; 540cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 541cabdff1aSopenharmony_ci dst1, dst1, dst1, dst1); 542cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 543cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 544cabdff1aSopenharmony_ci dst2 = const_vec; 545cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 546cabdff1aSopenharmony_ci dst2, dst2, dst2, dst2); 
547cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 548cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 549cabdff1aSopenharmony_ci dst3 = const_vec; 550cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 551cabdff1aSopenharmony_ci dst3, dst3, dst3, dst3); 552cabdff1aSopenharmony_ci 553cabdff1aSopenharmony_ci ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); 554cabdff1aSopenharmony_ci dst += (4 * dst_stride); 555cabdff1aSopenharmony_ci } 556cabdff1aSopenharmony_ci}

/*
 * Notes shared by the 8-tap kernels below:
 *  - src is a uint8_t pointer, so src_stride is counted in bytes; dst is an
 *    int16_t pointer, so dst_stride is counted in int16_t elements.
 *  - filter is loaded as a vector of 16-bit lanes; lanes 0..3 are splatted
 *    into filt0..filt3 and used as the four tap pairs.
 *  - Inputs are XORed with 128 before filtering and every accumulator is
 *    seeded with const_vec = 128 << 6.
 *    NOTE(review): presumably the seed cancels the bias introduced by the
 *    XOR, assuming the taps sum to 64 - confirm against the filter tables.
 *  - NOTE(review): the exact semantics of VSHF_B*_SB / DPADD_SB*_SH come from
 *    generic_macros_msa.h, which is not visible here.
 */

/*
 * Horizontal 8-tap filter of a 12-column block into 16-bit intermediates.
 *
 * Reading starts 3 bytes to the left of src. Four rows are produced per
 * iteration, so (height >> 2) groups of four rows are processed; the row
 * count now follows height instead of being fixed at 16 rows, matching the
 * other horizontal kernels in this file.
 *
 * Columns 0-7 of each row are gathered with mask0..mask3 from a single source
 * vector. Columns 8-11 of two rows are packed into one vector with
 * mask4..mask7 (the "4 width" half of ff_hevc_mask_arr) and written out as
 * 64-bit halves at dst + 8.
 */
static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    int64_t res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    src -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    /* Four rows per pass; honour the caller-supplied height. */
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask7, mask7, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);

        /* dst4/dst5 each hold columns 8-11 of two rows as 64-bit halves. */
        res0 = __msa_copy_s_d((v2i64) dst4, 0);
        res1 = __msa_copy_s_d((v2i64) dst4, 1);
        res2 = __msa_copy_s_d((v2i64) dst5, 0);
        res3 = __msa_copy_s_d((v2i64) dst5, 1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        SD4(res0, res1, res2, res3, (dst + 8), dst_stride);
        dst += (4 * dst_stride);
    }
}

/*
 * Horizontal 8-tap filter of a 16-column block into 16-bit intermediates.
 *
 * Reading starts 3 bytes to the left of src. Two rows are produced per
 * iteration ((height >> 1) iterations); each row is loaded as two vectors,
 * at src and src + 8, which yield columns 0-7 and 8-15 respectively.
 */
static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* src0/src1: first row (left/right half); src2/src3: second row. */
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        ST_SH2(dst0, dst2, dst, dst_stride);
        ST_SH2(dst1, dst3, dst + 8, dst_stride);
        dst += (2 * dst_stride);
    }
}

/*
 * Horizontal 8-tap filter of a 24-column block into 16-bit intermediates.
 *
 * Reading starts 3 bytes to the left of src. Two rows are produced per
 * iteration ((height >> 1) iterations). Each row is loaded as two 16-byte
 * vectors (offsets 0 and 16); columns 8-15 are gathered across both vectors
 * with mask4..mask7 (mask0 + 8 .. mask0 + 14).
 */
static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += src_stride;
        LD_SB2(src, 16, src2, src3);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* dst0..dst2: first row; dst3..dst5: second row. */
        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask4, mask0, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask5, mask1, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask6, mask2, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask7, mask3, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);

        ST_SH2(dst0, dst1, dst, 8);
        ST_SH(dst2, dst + 16);
        dst += dst_stride;
        ST_SH2(dst3, dst4, dst, 8);
        ST_SH(dst5, dst + 16);
        dst += dst_stride;
    }
}

/*
 * Horizontal 8-tap filter of a 32-column block into 16-bit intermediates.
 *
 * Reading starts 3 bytes to the left of src. One row is produced per
 * iteration (height iterations). The row is loaded as three 16-byte vectors
 * at offsets 0, 16 and 24; four result vectors are stored 8 elements apart.
 */
static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);
        src += src_stride;
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;
    }
}

/*
 * Horizontal 8-tap filter of a 48-column block into 16-bit intermediates.
 *
 * Reading starts 3 bytes to the left of src. One row is produced per
 * iteration (height iterations). The row is loaded as four 16-byte vectors
 * at offsets 0, 16, 32 and 40; the first four result vectors are stored at
 * dst and the last two at dst + 32.
 */
static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);

        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
        ST_SH2(dst4, dst5, (dst + 32), 8);
        dst += dst_stride;
    }
}

/*
 * Horizontal 8-tap filter of a 64-column block into 16-bit intermediates.
 *
 * Reading starts 3 bytes to the left of src. One row is produced per
 * iteration (height iterations). The row is loaded as five 16-byte vectors
 * at offsets 0, 16, 32, 48 and 56; each of the eight result vectors is stored
 * as soon as it is computed, 8 elements apart.
 */
static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src4 = LD_SB(src + 56);
        src += src_stride;
        XORI_B5_128_SB(src0, src1, src2, src3, src4);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        ST_SH(dst0, dst);

        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        ST_SH(dst1, dst + 8);

        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        ST_SH(dst2, dst + 16);

        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        ST_SH(dst3, dst + 24);

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        ST_SH(dst4, dst + 32);

        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst5 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        ST_SH(dst5, dst + 40);

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst6 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);
        ST_SH(dst6, dst + 48);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst7 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst7, dst7, dst7, dst7);
        ST_SH(dst7, dst + 56);
        dst += dst_stride;
    }
}

/*
 * Vertical 8-tap filter of a 4-column block into 16-bit intermediates.
 *
 * Reading starts 3 rows above src. Seven rows are preloaded; each iteration
 * then loads eight more rows and writes eight output rows, so only
 * (height >> 3) * 8 rows are produced. Adjacent rows are byte-interleaved and
 * two interleaved row pairs are packed per vector; the XOR with 128 is applied
 * to the packed vectors rather than to the raw rows.
 */
static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src -= (3 * src_stride);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
                   src1211_r, src1110_r, src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        dst10 = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        dst76 = const_vec;
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        /* Carry the last rows over as history for the next eight rows. */
        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

/*
 * Vertical 8-tap filter of an 8-column block into 16-bit intermediates.
 *
 * Reading starts 3 rows above src. Seven rows are preloaded and XORed with
 * 128; each iteration then loads four more rows and writes four output rows
 * ((height >> 2) iterations). Only the right (low) byte interleave of each
 * row pair is used.
 */
static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += (4 * dst_stride);

        /* Slide the interleaved-row history down by four rows. */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

1123cabdff1aSopenharmony_cistatic void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1124cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1125cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1126cabdff1aSopenharmony_ci{ 1127cabdff1aSopenharmony_ci int32_t loop_cnt; 1128cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1129cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1130cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1131cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 1132cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; 1133cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; 1134cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554, src8776, src10998; 1135cabdff1aSopenharmony_ci v8i16 dst0_l, dst1_l; 1136cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1137cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1138cabdff1aSopenharmony_ci 1139cabdff1aSopenharmony_ci src -= (3 * src_stride); 1140cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1141cabdff1aSopenharmony_ci const_vec <<= 6; 1142cabdff1aSopenharmony_ci 1143cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1144cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1145cabdff1aSopenharmony_ci 1146cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1147cabdff1aSopenharmony_ci src += (7 * src_stride); 1148cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1149cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1150cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1151cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1152cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1153cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 1154cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, 
src6, src5, src43_l, src65_l); 1155cabdff1aSopenharmony_ci ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, 1156cabdff1aSopenharmony_ci src2110, src4332, src6554); 1157cabdff1aSopenharmony_ci 1158cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1159cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 1160cabdff1aSopenharmony_ci src += (4 * src_stride); 1161cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1162cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1163cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1164cabdff1aSopenharmony_ci ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1165cabdff1aSopenharmony_ci src76_l, src87_l, src98_l, src109_l); 1166cabdff1aSopenharmony_ci ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998); 1167cabdff1aSopenharmony_ci 1168cabdff1aSopenharmony_ci dst0_r = const_vec; 1169cabdff1aSopenharmony_ci DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, 1170cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1171cabdff1aSopenharmony_ci dst0_r, dst0_r, dst0_r, dst0_r); 1172cabdff1aSopenharmony_ci dst1_r = const_vec; 1173cabdff1aSopenharmony_ci DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, 1174cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1175cabdff1aSopenharmony_ci dst1_r, dst1_r, dst1_r, dst1_r); 1176cabdff1aSopenharmony_ci dst2_r = const_vec; 1177cabdff1aSopenharmony_ci DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, 1178cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1179cabdff1aSopenharmony_ci dst2_r, dst2_r, dst2_r, dst2_r); 1180cabdff1aSopenharmony_ci dst3_r = const_vec; 1181cabdff1aSopenharmony_ci DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, 1182cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1183cabdff1aSopenharmony_ci dst3_r, dst3_r, dst3_r, dst3_r); 1184cabdff1aSopenharmony_ci dst0_l = const_vec; 1185cabdff1aSopenharmony_ci DPADD_SB4_SH(src2110, src4332, src6554, 
src8776, 1186cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1187cabdff1aSopenharmony_ci dst0_l, dst0_l, dst0_l, dst0_l); 1188cabdff1aSopenharmony_ci dst1_l = const_vec; 1189cabdff1aSopenharmony_ci DPADD_SB4_SH(src4332, src6554, src8776, src10998, 1190cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1191cabdff1aSopenharmony_ci dst1_l, dst1_l, dst1_l, dst1_l); 1192cabdff1aSopenharmony_ci 1193cabdff1aSopenharmony_ci ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); 1194cabdff1aSopenharmony_ci ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride); 1195cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1196cabdff1aSopenharmony_ci 1197cabdff1aSopenharmony_ci src10_r = src54_r; 1198cabdff1aSopenharmony_ci src32_r = src76_r; 1199cabdff1aSopenharmony_ci src54_r = src98_r; 1200cabdff1aSopenharmony_ci src21_r = src65_r; 1201cabdff1aSopenharmony_ci src43_r = src87_r; 1202cabdff1aSopenharmony_ci src65_r = src109_r; 1203cabdff1aSopenharmony_ci src2110 = src6554; 1204cabdff1aSopenharmony_ci src4332 = src8776; 1205cabdff1aSopenharmony_ci src6554 = src10998; 1206cabdff1aSopenharmony_ci src6 = src10; 1207cabdff1aSopenharmony_ci } 1208cabdff1aSopenharmony_ci} 1209cabdff1aSopenharmony_ci 1210cabdff1aSopenharmony_cistatic void hevc_vt_8t_16multx4mult_msa(uint8_t *src, 1211cabdff1aSopenharmony_ci int32_t src_stride, 1212cabdff1aSopenharmony_ci int16_t *dst, 1213cabdff1aSopenharmony_ci int32_t dst_stride, 1214cabdff1aSopenharmony_ci const int8_t *filter, 1215cabdff1aSopenharmony_ci int32_t height, 1216cabdff1aSopenharmony_ci int32_t width) 1217cabdff1aSopenharmony_ci{ 1218cabdff1aSopenharmony_ci uint8_t *src_tmp; 1219cabdff1aSopenharmony_ci int16_t *dst_tmp; 1220cabdff1aSopenharmony_ci int32_t loop_cnt, cnt; 1221cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1222cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1223cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 
1224cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 1225cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; 1226cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; 1227cabdff1aSopenharmony_ci v8i16 dst0_l, dst1_l, dst2_l, dst3_l; 1228cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1229cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1230cabdff1aSopenharmony_ci 1231cabdff1aSopenharmony_ci src -= (3 * src_stride); 1232cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1233cabdff1aSopenharmony_ci const_vec <<= 6; 1234cabdff1aSopenharmony_ci 1235cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1236cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1237cabdff1aSopenharmony_ci 1238cabdff1aSopenharmony_ci for (cnt = width >> 4; cnt--;) { 1239cabdff1aSopenharmony_ci src_tmp = src; 1240cabdff1aSopenharmony_ci dst_tmp = dst; 1241cabdff1aSopenharmony_ci 1242cabdff1aSopenharmony_ci LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1243cabdff1aSopenharmony_ci src_tmp += (7 * src_stride); 1244cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1245cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1246cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1247cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1248cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1249cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 1250cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1251cabdff1aSopenharmony_ci 1252cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1253cabdff1aSopenharmony_ci LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); 1254cabdff1aSopenharmony_ci src_tmp += (4 * src_stride); 1255cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 
1256cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1257cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1258cabdff1aSopenharmony_ci ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1259cabdff1aSopenharmony_ci src76_l, src87_l, src98_l, src109_l); 1260cabdff1aSopenharmony_ci 1261cabdff1aSopenharmony_ci dst0_r = const_vec; 1262cabdff1aSopenharmony_ci DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, 1263cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1264cabdff1aSopenharmony_ci dst0_r, dst0_r, dst0_r, dst0_r); 1265cabdff1aSopenharmony_ci dst1_r = const_vec; 1266cabdff1aSopenharmony_ci DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, 1267cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1268cabdff1aSopenharmony_ci dst1_r, dst1_r, dst1_r, dst1_r); 1269cabdff1aSopenharmony_ci dst2_r = const_vec; 1270cabdff1aSopenharmony_ci DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, 1271cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1272cabdff1aSopenharmony_ci dst2_r, dst2_r, dst2_r, dst2_r); 1273cabdff1aSopenharmony_ci dst3_r = const_vec; 1274cabdff1aSopenharmony_ci DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, 1275cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1276cabdff1aSopenharmony_ci dst3_r, dst3_r, dst3_r, dst3_r); 1277cabdff1aSopenharmony_ci dst0_l = const_vec; 1278cabdff1aSopenharmony_ci DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l, 1279cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1280cabdff1aSopenharmony_ci dst0_l, dst0_l, dst0_l, dst0_l); 1281cabdff1aSopenharmony_ci dst1_l = const_vec; 1282cabdff1aSopenharmony_ci DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l, 1283cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1284cabdff1aSopenharmony_ci dst1_l, dst1_l, dst1_l, dst1_l); 1285cabdff1aSopenharmony_ci dst2_l = const_vec; 1286cabdff1aSopenharmony_ci DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l, 1287cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1288cabdff1aSopenharmony_ci 
dst2_l, dst2_l, dst2_l, dst2_l); 1289cabdff1aSopenharmony_ci dst3_l = const_vec; 1290cabdff1aSopenharmony_ci DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l, 1291cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1292cabdff1aSopenharmony_ci dst3_l, dst3_l, dst3_l, dst3_l); 1293cabdff1aSopenharmony_ci 1294cabdff1aSopenharmony_ci ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride); 1295cabdff1aSopenharmony_ci ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride); 1296cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 1297cabdff1aSopenharmony_ci 1298cabdff1aSopenharmony_ci src10_r = src54_r; 1299cabdff1aSopenharmony_ci src32_r = src76_r; 1300cabdff1aSopenharmony_ci src54_r = src98_r; 1301cabdff1aSopenharmony_ci src21_r = src65_r; 1302cabdff1aSopenharmony_ci src43_r = src87_r; 1303cabdff1aSopenharmony_ci src65_r = src109_r; 1304cabdff1aSopenharmony_ci src10_l = src54_l; 1305cabdff1aSopenharmony_ci src32_l = src76_l; 1306cabdff1aSopenharmony_ci src54_l = src98_l; 1307cabdff1aSopenharmony_ci src21_l = src65_l; 1308cabdff1aSopenharmony_ci src43_l = src87_l; 1309cabdff1aSopenharmony_ci src65_l = src109_l; 1310cabdff1aSopenharmony_ci src6 = src10; 1311cabdff1aSopenharmony_ci } 1312cabdff1aSopenharmony_ci 1313cabdff1aSopenharmony_ci src += 16; 1314cabdff1aSopenharmony_ci dst += 16; 1315cabdff1aSopenharmony_ci } 1316cabdff1aSopenharmony_ci} 1317cabdff1aSopenharmony_ci 1318cabdff1aSopenharmony_cistatic void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, 1319cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1320cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1321cabdff1aSopenharmony_ci{ 1322cabdff1aSopenharmony_ci hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1323cabdff1aSopenharmony_ci filter, height, 16); 1324cabdff1aSopenharmony_ci} 1325cabdff1aSopenharmony_ci 1326cabdff1aSopenharmony_cistatic void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, 1327cabdff1aSopenharmony_ci int16_t *dst, int32_t 
dst_stride, 1328cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1329cabdff1aSopenharmony_ci{ 1330cabdff1aSopenharmony_ci hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1331cabdff1aSopenharmony_ci filter, height, 16); 1332cabdff1aSopenharmony_ci hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, 1333cabdff1aSopenharmony_ci filter, height); 1334cabdff1aSopenharmony_ci} 1335cabdff1aSopenharmony_ci 1336cabdff1aSopenharmony_cistatic void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, 1337cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1338cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1339cabdff1aSopenharmony_ci{ 1340cabdff1aSopenharmony_ci hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1341cabdff1aSopenharmony_ci filter, height, 32); 1342cabdff1aSopenharmony_ci} 1343cabdff1aSopenharmony_ci 1344cabdff1aSopenharmony_cistatic void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, 1345cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1346cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1347cabdff1aSopenharmony_ci{ 1348cabdff1aSopenharmony_ci hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1349cabdff1aSopenharmony_ci filter, height, 48); 1350cabdff1aSopenharmony_ci} 1351cabdff1aSopenharmony_ci 1352cabdff1aSopenharmony_cistatic void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, 1353cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1354cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1355cabdff1aSopenharmony_ci{ 1356cabdff1aSopenharmony_ci hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1357cabdff1aSopenharmony_ci filter, height, 64); 1358cabdff1aSopenharmony_ci} 1359cabdff1aSopenharmony_ci 1360cabdff1aSopenharmony_cistatic void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride, 1361cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1362cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t 
*filter_y, 1363cabdff1aSopenharmony_ci int32_t height) 1364cabdff1aSopenharmony_ci{ 1365cabdff1aSopenharmony_ci uint32_t loop_cnt; 1366cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1367cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1368cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1369cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 1370cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1371cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1372cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1373cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108; 1374cabdff1aSopenharmony_ci v4i32 dst0_r, dst1_r, dst2_r, dst3_r; 1375cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r; 1376cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r; 1377cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1378cabdff1aSopenharmony_ci 1379cabdff1aSopenharmony_ci src -= ((3 * src_stride) + 3); 1380cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1381cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1382cabdff1aSopenharmony_ci 1383cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1384cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 1385cabdff1aSopenharmony_ci 1386cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1387cabdff1aSopenharmony_ci 1388cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1389cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1390cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1391cabdff1aSopenharmony_ci 1392cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1393cabdff1aSopenharmony_ci const_vec <<= 6; 1394cabdff1aSopenharmony_ci 1395cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1396cabdff1aSopenharmony_ci src += (7 * 
src_stride); 1397cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1398cabdff1aSopenharmony_ci 1399cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1400cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1401cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, 1402cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1403cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, 1404cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1405cabdff1aSopenharmony_ci dst30 = const_vec; 1406cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 1407cabdff1aSopenharmony_ci dst30, dst30, dst30, dst30); 1408cabdff1aSopenharmony_ci dst41 = const_vec; 1409cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, 1410cabdff1aSopenharmony_ci dst41, dst41, dst41, dst41); 1411cabdff1aSopenharmony_ci dst52 = const_vec; 1412cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, 1413cabdff1aSopenharmony_ci dst52, dst52, dst52, dst52); 1414cabdff1aSopenharmony_ci dst63 = const_vec; 1415cabdff1aSopenharmony_ci DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, 1416cabdff1aSopenharmony_ci dst63, dst63, dst63, dst63); 1417cabdff1aSopenharmony_ci 1418cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r); 1419cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r); 1420cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r); 1421cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 1422cabdff1aSopenharmony_ci 1423cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 1424cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 1425cabdff1aSopenharmony_ci src += (4 * src_stride); 1426cabdff1aSopenharmony_ci 
XORI_B4_128_SB(src7, src8, src9, src10); 1427cabdff1aSopenharmony_ci 1428cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3, 1429cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1430cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3, 1431cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1432cabdff1aSopenharmony_ci dst97 = const_vec; 1433cabdff1aSopenharmony_ci dst108 = const_vec; 1434cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 1435cabdff1aSopenharmony_ci dst97, dst97, dst97, dst97); 1436cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, 1437cabdff1aSopenharmony_ci dst108, dst108, dst108, dst108); 1438cabdff1aSopenharmony_ci 1439cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst97, dst66); 1440cabdff1aSopenharmony_ci ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r); 1441cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 1442cabdff1aSopenharmony_ci dst98_r = __msa_ilvr_h(dst66, dst108); 1443cabdff1aSopenharmony_ci 1444cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 1445cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1446cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, 1447cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1448cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, 1449cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1450cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, 1451cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1452cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 1453cabdff1aSopenharmony_ci PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r); 1454cabdff1aSopenharmony_ci ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride); 1455cabdff1aSopenharmony_ci dst += (4 * dst_stride); 
1456cabdff1aSopenharmony_ci 1457cabdff1aSopenharmony_ci dst10_r = dst54_r; 1458cabdff1aSopenharmony_ci dst32_r = dst76_r; 1459cabdff1aSopenharmony_ci dst54_r = dst98_r; 1460cabdff1aSopenharmony_ci dst21_r = dst65_r; 1461cabdff1aSopenharmony_ci dst43_r = dst87_r; 1462cabdff1aSopenharmony_ci dst65_r = dst109_r; 1463cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 1464cabdff1aSopenharmony_ci } 1465cabdff1aSopenharmony_ci} 1466cabdff1aSopenharmony_ci 1467cabdff1aSopenharmony_cistatic void hevc_hv_8t_8multx1mult_msa(uint8_t *src, 1468cabdff1aSopenharmony_ci int32_t src_stride, 1469cabdff1aSopenharmony_ci int16_t *dst, 1470cabdff1aSopenharmony_ci int32_t dst_stride, 1471cabdff1aSopenharmony_ci const int8_t *filter_x, 1472cabdff1aSopenharmony_ci const int8_t *filter_y, 1473cabdff1aSopenharmony_ci int32_t height, int32_t width) 1474cabdff1aSopenharmony_ci{ 1475cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 1476cabdff1aSopenharmony_ci uint8_t *src_tmp; 1477cabdff1aSopenharmony_ci int16_t *dst_tmp; 1478cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 1479cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1480cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1481cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 1482cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1483cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1484cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1485cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1486cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l; 1487cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 1488cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst76_l; 1489cabdff1aSopenharmony_ci v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 1490cabdff1aSopenharmony_ci 1491cabdff1aSopenharmony_ci src -= ((3 * src_stride) + 3); 
1492cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1493cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1494cabdff1aSopenharmony_ci 1495cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1496cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 1497cabdff1aSopenharmony_ci 1498cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1499cabdff1aSopenharmony_ci 1500cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1501cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1502cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1503cabdff1aSopenharmony_ci 1504cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1505cabdff1aSopenharmony_ci const_vec <<= 6; 1506cabdff1aSopenharmony_ci 1507cabdff1aSopenharmony_ci for (cnt = width >> 3; cnt--;) { 1508cabdff1aSopenharmony_ci src_tmp = src; 1509cabdff1aSopenharmony_ci dst_tmp = dst; 1510cabdff1aSopenharmony_ci 1511cabdff1aSopenharmony_ci LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1512cabdff1aSopenharmony_ci src_tmp += (7 * src_stride); 1513cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1514cabdff1aSopenharmony_ci 1515cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 1516cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1517cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1518cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1519cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1520cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1521cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1522cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 1523cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1524cabdff1aSopenharmony_ci dst0 = const_vec; 1525cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 1526cabdff1aSopenharmony_ci dst0, dst0, dst0, dst0); 
1527cabdff1aSopenharmony_ci dst1 = const_vec; 1528cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, 1529cabdff1aSopenharmony_ci dst1, dst1, dst1, dst1); 1530cabdff1aSopenharmony_ci dst2 = const_vec; 1531cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, 1532cabdff1aSopenharmony_ci dst2, dst2, dst2, dst2); 1533cabdff1aSopenharmony_ci dst3 = const_vec; 1534cabdff1aSopenharmony_ci DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, 1535cabdff1aSopenharmony_ci dst3, dst3, dst3, dst3); 1536cabdff1aSopenharmony_ci 1537cabdff1aSopenharmony_ci /* row 4 row 5 row 6 */ 1538cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 1539cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1540cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, 1541cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1542cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, 1543cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1544cabdff1aSopenharmony_ci dst4 = const_vec; 1545cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 1546cabdff1aSopenharmony_ci dst4, dst4, dst4, dst4); 1547cabdff1aSopenharmony_ci dst5 = const_vec; 1548cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, 1549cabdff1aSopenharmony_ci dst5, dst5, dst5, dst5); 1550cabdff1aSopenharmony_ci dst6 = const_vec; 1551cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, 1552cabdff1aSopenharmony_ci dst6, dst6, dst6, dst6); 1553cabdff1aSopenharmony_ci 1554cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1555cabdff1aSopenharmony_ci src7 = LD_SB(src_tmp); 1556cabdff1aSopenharmony_ci src7 = (v16i8) __msa_xori_b((v16u8) src7, 128); 1557cabdff1aSopenharmony_ci src_tmp += src_stride; 1558cabdff1aSopenharmony_ci 1559cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, 
mask1, mask2, mask3, 1560cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1561cabdff1aSopenharmony_ci dst7 = const_vec; 1562cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 1563cabdff1aSopenharmony_ci dst7, dst7, dst7, dst7); 1564cabdff1aSopenharmony_ci 1565cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 1566cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 1567cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 1568cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 1569cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 1570cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1571cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 1572cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1573cabdff1aSopenharmony_ci dst0_r >>= 6; 1574cabdff1aSopenharmony_ci dst0_l >>= 6; 1575cabdff1aSopenharmony_ci 1576cabdff1aSopenharmony_ci dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 1577cabdff1aSopenharmony_ci ST_SW(dst0_r, dst_tmp); 1578cabdff1aSopenharmony_ci dst_tmp += dst_stride; 1579cabdff1aSopenharmony_ci 1580cabdff1aSopenharmony_ci dst0 = dst1; 1581cabdff1aSopenharmony_ci dst1 = dst2; 1582cabdff1aSopenharmony_ci dst2 = dst3; 1583cabdff1aSopenharmony_ci dst3 = dst4; 1584cabdff1aSopenharmony_ci dst4 = dst5; 1585cabdff1aSopenharmony_ci dst5 = dst6; 1586cabdff1aSopenharmony_ci dst6 = dst7; 1587cabdff1aSopenharmony_ci } 1588cabdff1aSopenharmony_ci 1589cabdff1aSopenharmony_ci src += 8; 1590cabdff1aSopenharmony_ci dst += 8; 1591cabdff1aSopenharmony_ci } 1592cabdff1aSopenharmony_ci} 1593cabdff1aSopenharmony_ci 1594cabdff1aSopenharmony_cistatic void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride, 1595cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1596cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1597cabdff1aSopenharmony_ci int32_t height) 
1598cabdff1aSopenharmony_ci{ 1599cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1600cabdff1aSopenharmony_ci filter_x, filter_y, height, 8); 1601cabdff1aSopenharmony_ci} 1602cabdff1aSopenharmony_ci 1603cabdff1aSopenharmony_cistatic void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride, 1604cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1605cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1606cabdff1aSopenharmony_ci int32_t height) 1607cabdff1aSopenharmony_ci{ 1608cabdff1aSopenharmony_ci uint32_t loop_cnt; 1609cabdff1aSopenharmony_ci uint8_t *src_tmp; 1610cabdff1aSopenharmony_ci int16_t *dst_tmp; 1611cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1612cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1613cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1614cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1615cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3; 1616cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1617cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1618cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108; 1619cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r; 1620cabdff1aSopenharmony_ci v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l; 1621cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r; 1622cabdff1aSopenharmony_ci 1623cabdff1aSopenharmony_ci src -= ((3 * src_stride) + 3); 1624cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1625cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1626cabdff1aSopenharmony_ci 1627cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1628cabdff1aSopenharmony_ci 
UNPCK_R_SB_SH(filter_vec, filter_vec); 1629cabdff1aSopenharmony_ci 1630cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1631cabdff1aSopenharmony_ci 1632cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 1633cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1634cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1635cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1636cabdff1aSopenharmony_ci 1637cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1638cabdff1aSopenharmony_ci const_vec <<= 6; 1639cabdff1aSopenharmony_ci 1640cabdff1aSopenharmony_ci src_tmp = src; 1641cabdff1aSopenharmony_ci dst_tmp = dst; 1642cabdff1aSopenharmony_ci 1643cabdff1aSopenharmony_ci LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1644cabdff1aSopenharmony_ci src_tmp += (7 * src_stride); 1645cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1646cabdff1aSopenharmony_ci 1647cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 1648cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1649cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1650cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 1651cabdff1aSopenharmony_ci vec11); 1652cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14, 1653cabdff1aSopenharmony_ci vec15); 1654cabdff1aSopenharmony_ci dst0 = const_vec; 1655cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst0, dst0, 1656cabdff1aSopenharmony_ci dst0, dst0); 1657cabdff1aSopenharmony_ci dst1 = const_vec; 1658cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst1, dst1, 1659cabdff1aSopenharmony_ci dst1, dst1); 1660cabdff1aSopenharmony_ci dst2 = const_vec; 1661cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst2, 
1662cabdff1aSopenharmony_ci dst2, dst2, dst2); 1663cabdff1aSopenharmony_ci dst3 = const_vec; 1664cabdff1aSopenharmony_ci DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst3, 1665cabdff1aSopenharmony_ci dst3, dst3, dst3); 1666cabdff1aSopenharmony_ci 1667cabdff1aSopenharmony_ci /* row 4 row 5 row 6 */ 1668cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1669cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1670cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 1671cabdff1aSopenharmony_ci vec11); 1672cabdff1aSopenharmony_ci dst4 = const_vec; 1673cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst4, dst4, 1674cabdff1aSopenharmony_ci dst4, dst4); 1675cabdff1aSopenharmony_ci dst5 = const_vec; 1676cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst5, dst5, 1677cabdff1aSopenharmony_ci dst5, dst5); 1678cabdff1aSopenharmony_ci dst6 = const_vec; 1679cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst6, 1680cabdff1aSopenharmony_ci dst6, dst6, dst6); 1681cabdff1aSopenharmony_ci 1682cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1683cabdff1aSopenharmony_ci src7 = LD_SB(src_tmp); 1684cabdff1aSopenharmony_ci src7 = (v16i8) __msa_xori_b((v16u8) src7, 128); 1685cabdff1aSopenharmony_ci src_tmp += src_stride; 1686cabdff1aSopenharmony_ci 1687cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 1688cabdff1aSopenharmony_ci vec3); 1689cabdff1aSopenharmony_ci dst7 = const_vec; 1690cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst7, 1691cabdff1aSopenharmony_ci dst7, dst7, dst7); 1692cabdff1aSopenharmony_ci 1693cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 1694cabdff1aSopenharmony_ci 
ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 1695cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 1696cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 1697cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 1698cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1699cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0, 1700cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1701cabdff1aSopenharmony_ci dst0_r >>= 6; 1702cabdff1aSopenharmony_ci dst0_l >>= 6; 1703cabdff1aSopenharmony_ci 1704cabdff1aSopenharmony_ci dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 1705cabdff1aSopenharmony_ci ST_SW(dst0_r, dst_tmp); 1706cabdff1aSopenharmony_ci dst_tmp += dst_stride; 1707cabdff1aSopenharmony_ci 1708cabdff1aSopenharmony_ci dst0 = dst1; 1709cabdff1aSopenharmony_ci dst1 = dst2; 1710cabdff1aSopenharmony_ci dst2 = dst3; 1711cabdff1aSopenharmony_ci dst3 = dst4; 1712cabdff1aSopenharmony_ci dst4 = dst5; 1713cabdff1aSopenharmony_ci dst5 = dst6; 1714cabdff1aSopenharmony_ci dst6 = dst7; 1715cabdff1aSopenharmony_ci } 1716cabdff1aSopenharmony_ci 1717cabdff1aSopenharmony_ci src += 8; 1718cabdff1aSopenharmony_ci dst += 8; 1719cabdff1aSopenharmony_ci 1720cabdff1aSopenharmony_ci mask4 = LD_SB(ff_hevc_mask_arr + 16); 1721cabdff1aSopenharmony_ci mask5 = mask4 + 2; 1722cabdff1aSopenharmony_ci mask6 = mask4 + 4; 1723cabdff1aSopenharmony_ci mask7 = mask4 + 6; 1724cabdff1aSopenharmony_ci 1725cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1726cabdff1aSopenharmony_ci src += (7 * src_stride); 1727cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1728cabdff1aSopenharmony_ci 1729cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); 1730cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7); 
1731cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10, 1732cabdff1aSopenharmony_ci vec11); 1733cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14, 1734cabdff1aSopenharmony_ci vec15); 1735cabdff1aSopenharmony_ci dst30 = const_vec; 1736cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst30, 1737cabdff1aSopenharmony_ci dst30, dst30, dst30); 1738cabdff1aSopenharmony_ci dst41 = const_vec; 1739cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst41, 1740cabdff1aSopenharmony_ci dst41, dst41, dst41); 1741cabdff1aSopenharmony_ci dst52 = const_vec; 1742cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst52, 1743cabdff1aSopenharmony_ci dst52, dst52, dst52); 1744cabdff1aSopenharmony_ci dst63 = const_vec; 1745cabdff1aSopenharmony_ci DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst63, 1746cabdff1aSopenharmony_ci dst63, dst63, dst63); 1747cabdff1aSopenharmony_ci 1748cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r); 1749cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r); 1750cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r); 1751cabdff1aSopenharmony_ci 1752cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 1753cabdff1aSopenharmony_ci 1754cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 1755cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 1756cabdff1aSopenharmony_ci src += (4 * src_stride); 1757cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1758cabdff1aSopenharmony_ci 1759cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2, 1760cabdff1aSopenharmony_ci vec3); 1761cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6, 
1762cabdff1aSopenharmony_ci vec7); 1763cabdff1aSopenharmony_ci dst97 = const_vec; 1764cabdff1aSopenharmony_ci dst108 = const_vec; 1765cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst97, 1766cabdff1aSopenharmony_ci dst97, dst97, dst97); 1767cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst108, 1768cabdff1aSopenharmony_ci dst108, dst108, dst108); 1769cabdff1aSopenharmony_ci 1770cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst97, dst66); 1771cabdff1aSopenharmony_ci ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r); 1772cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 1773cabdff1aSopenharmony_ci dst98_r = __msa_ilvr_h(dst66, dst108); 1774cabdff1aSopenharmony_ci 1775cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 1776cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1777cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0, 1778cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1779cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0, 1780cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1781cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0, 1782cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1783cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 1784cabdff1aSopenharmony_ci PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r); 1785cabdff1aSopenharmony_ci ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride); 1786cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1787cabdff1aSopenharmony_ci 1788cabdff1aSopenharmony_ci dst10_r = dst54_r; 1789cabdff1aSopenharmony_ci dst32_r = dst76_r; 1790cabdff1aSopenharmony_ci dst54_r = dst98_r; 1791cabdff1aSopenharmony_ci dst21_r = dst65_r; 1792cabdff1aSopenharmony_ci dst43_r = dst87_r; 1793cabdff1aSopenharmony_ci dst65_r = dst109_r; 
1794cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 1795cabdff1aSopenharmony_ci } 1796cabdff1aSopenharmony_ci} 1797cabdff1aSopenharmony_ci /* 8-tap HV, 16w entry point: forwards every argument unchanged to hevc_hv_8t_8multx1mult_msa() and appends the literal 16 (presumably the block width -- the helper is defined outside this chunk, confirm there). */1798cabdff1aSopenharmony_cistatic void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride, 1799cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1800cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1801cabdff1aSopenharmony_ci int32_t height) 1802cabdff1aSopenharmony_ci{ 1803cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1804cabdff1aSopenharmony_ci filter_x, filter_y, height, 16); 1805cabdff1aSopenharmony_ci} 1806cabdff1aSopenharmony_ci /* 24w variant of the wrapper above: identical call, trailing literal 24. */1807cabdff1aSopenharmony_cistatic void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride, 1808cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1809cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1810cabdff1aSopenharmony_ci int32_t height) 1811cabdff1aSopenharmony_ci{ 1812cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1813cabdff1aSopenharmony_ci filter_x, filter_y, height, 24); 1814cabdff1aSopenharmony_ci} 1815cabdff1aSopenharmony_ci /* 32w variant: identical call, trailing literal 32. */1816cabdff1aSopenharmony_cistatic void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride, 1817cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1818cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1819cabdff1aSopenharmony_ci int32_t height) 1820cabdff1aSopenharmony_ci{ 1821cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1822cabdff1aSopenharmony_ci filter_x, filter_y, height, 32); 1823cabdff1aSopenharmony_ci} 1824cabdff1aSopenharmony_ci /* 48w variant: identical call, trailing literal 48. */1825cabdff1aSopenharmony_cistatic void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride, 1826cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1827cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1828cabdff1aSopenharmony_ci int32_t height) 1829cabdff1aSopenharmony_ci{
1830cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1831cabdff1aSopenharmony_ci filter_x, filter_y, height, 48); 1832cabdff1aSopenharmony_ci} 1833cabdff1aSopenharmony_ci /* 8-tap HV, 64w entry point: forwards every argument unchanged to hevc_hv_8t_8multx1mult_msa() and appends the literal 64 (presumably the block width -- the helper is not visible in this chunk, confirm there). */1834cabdff1aSopenharmony_cistatic void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride, 1835cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1836cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1837cabdff1aSopenharmony_ci int32_t height) 1838cabdff1aSopenharmony_ci{ 1839cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1840cabdff1aSopenharmony_ci filter_x, filter_y, height, 64); 1841cabdff1aSopenharmony_ci} 1842cabdff1aSopenharmony_ci /* Horizontal 4-tap filter for a 4-column, 2-row block. Loads exactly two rows (LD_SB2) starting at src - 1, seeds the accumulator with const_vec, accumulates the two coefficient pairs filt0/filt1 and stores the int16 result with ST_D2 at dst / dst_stride. There is no height parameter. The shuffle mask is the second 16-byte row of ff_hevc_mask_arr ("4 width cases"); its indices 16..20 address the second shuffle operand, so both rows are filtered in a single vector. */1843cabdff1aSopenharmony_cistatic void hevc_hz_4t_4x2_msa(uint8_t *src, 1844cabdff1aSopenharmony_ci int32_t src_stride, 1845cabdff1aSopenharmony_ci int16_t *dst, 1846cabdff1aSopenharmony_ci int32_t dst_stride, 1847cabdff1aSopenharmony_ci const int8_t *filter) 1848cabdff1aSopenharmony_ci{ 1849cabdff1aSopenharmony_ci v8i16 filt0, filt1; 1850cabdff1aSopenharmony_ci v16i8 src0, src1; 1851cabdff1aSopenharmony_ci v16i8 mask1, vec0, vec1; 1852cabdff1aSopenharmony_ci v8i16 dst0; 1853cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1854cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1855cabdff1aSopenharmony_ci 1856cabdff1aSopenharmony_ci src -= 1; 1857cabdff1aSopenharmony_ci 1858cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1859cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 1860cabdff1aSopenharmony_ci 1861cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1862cabdff1aSopenharmony_ci 1863cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1864cabdff1aSopenharmony_ci const_vec <<= 6; /* every halfword lane now holds 128 << 6 = 8192 */1865cabdff1aSopenharmony_ci 1866cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src1); 1867cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); /* NOTE(review): the xor with 128 presumably re-biases unsigned pixels to signed bytes for the signed dot product, with const_vec cancelling that bias -- confirm against the macro definitions */1868cabdff1aSopenharmony_ci 1869cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0,
mask1, vec0, vec1); 1870cabdff1aSopenharmony_ci dst0 = const_vec; 1871cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 1872cabdff1aSopenharmony_ci 1873cabdff1aSopenharmony_ci ST_D2(dst0, 0, 1, dst, dst_stride); 1874cabdff1aSopenharmony_ci} 1875cabdff1aSopenharmony_ci /* Horizontal 4-tap filter for a 4-column, 4-row block. Loads exactly four rows (LD_SB4) starting at src - 1 and filters them as two row pairs (src0/src1 into dst0, src2/src3 into dst1), each accumulator seeded with const_vec (128 << 6 per halfword lane); the int16 results are stored with ST_D4 at dst / dst_stride. There is no height parameter. The shuffle mask is the second 16-byte row of ff_hevc_mask_arr ("4 width cases"), whose indices 16..20 address the second shuffle operand. */1876cabdff1aSopenharmony_cistatic void hevc_hz_4t_4x4_msa(uint8_t *src, 1877cabdff1aSopenharmony_ci int32_t src_stride, 1878cabdff1aSopenharmony_ci int16_t *dst, 1879cabdff1aSopenharmony_ci int32_t dst_stride, 1880cabdff1aSopenharmony_ci const int8_t *filter) 1881cabdff1aSopenharmony_ci{ 1882cabdff1aSopenharmony_ci v8i16 filt0, filt1; 1883cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 1884cabdff1aSopenharmony_ci v16i8 mask1, vec0, vec1; 1885cabdff1aSopenharmony_ci v8i16 dst0, dst1; 1886cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1887cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1888cabdff1aSopenharmony_ci 1889cabdff1aSopenharmony_ci src -= 1; 1890cabdff1aSopenharmony_ci 1891cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1892cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 1893cabdff1aSopenharmony_ci 1894cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1895cabdff1aSopenharmony_ci 1896cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1897cabdff1aSopenharmony_ci const_vec <<= 6; 1898cabdff1aSopenharmony_ci 1899cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 1900cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 1901cabdff1aSopenharmony_ci 1902cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 1903cabdff1aSopenharmony_ci dst0 = const_vec; 1904cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 1905cabdff1aSopenharmony_ci 1906cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); 1907cabdff1aSopenharmony_ci dst1 = const_vec; 1908cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1,
dst1, dst1); 1909cabdff1aSopenharmony_ci 1910cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 1911cabdff1aSopenharmony_ci} 1912cabdff1aSopenharmony_ci /* Horizontal 4-tap filter for 4-column blocks, eight rows per loop iteration. Runs height >> 3 iterations, so rows beyond the last full group of eight are not processed. Each iteration loads eight rows from src - 1, filters them as four row pairs (each accumulator seeded with const_vec = 128 << 6 per halfword lane) and stores the int16 results with ST_D8. Uses the second row of ff_hevc_mask_arr ("4 width cases"). */1913cabdff1aSopenharmony_cistatic void hevc_hz_4t_4x8multiple_msa(uint8_t *src, 1914cabdff1aSopenharmony_ci int32_t src_stride, 1915cabdff1aSopenharmony_ci int16_t *dst, 1916cabdff1aSopenharmony_ci int32_t dst_stride, 1917cabdff1aSopenharmony_ci const int8_t *filter, 1918cabdff1aSopenharmony_ci int32_t height) 1919cabdff1aSopenharmony_ci{ 1920cabdff1aSopenharmony_ci uint32_t loop_cnt; 1921cabdff1aSopenharmony_ci v8i16 filt0, filt1; 1922cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 1923cabdff1aSopenharmony_ci v16i8 mask1, vec0, vec1; 1924cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 1925cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1926cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1927cabdff1aSopenharmony_ci 1928cabdff1aSopenharmony_ci src -= 1; 1929cabdff1aSopenharmony_ci 1930cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1931cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 1932cabdff1aSopenharmony_ci 1933cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1934cabdff1aSopenharmony_ci 1935cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1936cabdff1aSopenharmony_ci const_vec <<= 6; 1937cabdff1aSopenharmony_ci 1938cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 1939cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 1940cabdff1aSopenharmony_ci src += (8 * src_stride); 1941cabdff1aSopenharmony_ci 1942cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 1943cabdff1aSopenharmony_ci 1944cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 1945cabdff1aSopenharmony_ci dst0 = const_vec; 1946cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1947cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); 1948cabdff1aSopenharmony_ci dst1 = const_vec; 1949cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 1950cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1); 1951cabdff1aSopenharmony_ci dst2 = const_vec; 1952cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 1953cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1); 1954cabdff1aSopenharmony_ci dst3 = const_vec; 1955cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 1956cabdff1aSopenharmony_ci 1957cabdff1aSopenharmony_ci ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 1958cabdff1aSopenharmony_ci dst += (8 * dst_stride); 1959cabdff1aSopenharmony_ci } 1960cabdff1aSopenharmony_ci} 1961cabdff1aSopenharmony_ci /* Height dispatcher for the 4-column horizontal 4-tap filter: height 2 -> hevc_hz_4t_4x2_msa, height 4 -> hevc_hz_4t_4x4_msa, any multiple of 8 -> hevc_hz_4t_4x8multiple_msa. NOTE(review): every other height matches no branch and nothing is written to dst -- confirm callers never pass such heights. */1962cabdff1aSopenharmony_cistatic void hevc_hz_4t_4w_msa(uint8_t *src, 1963cabdff1aSopenharmony_ci int32_t src_stride, 1964cabdff1aSopenharmony_ci int16_t *dst, 1965cabdff1aSopenharmony_ci int32_t dst_stride, 1966cabdff1aSopenharmony_ci const int8_t *filter, 1967cabdff1aSopenharmony_ci int32_t height) 1968cabdff1aSopenharmony_ci{ 1969cabdff1aSopenharmony_ci if (2 == height) { 1970cabdff1aSopenharmony_ci hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter); 1971cabdff1aSopenharmony_ci } else if (4 == height) { 1972cabdff1aSopenharmony_ci hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter); 1973cabdff1aSopenharmony_ci } else if (0 == height % 8) { 1974cabdff1aSopenharmony_ci hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride, 1975cabdff1aSopenharmony_ci filter, height); 1976cabdff1aSopenharmony_ci } 1977cabdff1aSopenharmony_ci} 1978cabdff1aSopenharmony_ci /* Horizontal 4-tap filter that writes 6 int16 values per row: one 64-bit store at dst plus one 32-bit store at dst + 4. Each row is shuffled against itself with the first row of ff_hevc_mask_arr ("8 width cases"). NOTE(review): the height argument is never read; the loop is hard-coded to 2 iterations of 4 rows, so exactly 8 rows are always processed. */1979cabdff1aSopenharmony_cistatic void hevc_hz_4t_6w_msa(uint8_t *src, 1980cabdff1aSopenharmony_ci int32_t src_stride, 1981cabdff1aSopenharmony_ci int16_t *dst, 1982cabdff1aSopenharmony_ci
int32_t dst_stride, 1983cabdff1aSopenharmony_ci const int8_t *filter, 1984cabdff1aSopenharmony_ci int32_t height) 1985cabdff1aSopenharmony_ci{ 1986cabdff1aSopenharmony_ci uint32_t loop_cnt; 1987cabdff1aSopenharmony_ci uint64_t dst_val0, dst_val1, dst_val2, dst_val3; 1988cabdff1aSopenharmony_ci uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3; 1989cabdff1aSopenharmony_ci v8i16 filt0, filt1, dst0, dst1, dst2, dst3; 1990cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 1991cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 1992cabdff1aSopenharmony_ci v16i8 mask1; 1993cabdff1aSopenharmony_ci v16i8 vec0, vec1; 1994cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1995cabdff1aSopenharmony_ci 1996cabdff1aSopenharmony_ci src -= 1; 1997cabdff1aSopenharmony_ci 1998cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1999cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2000cabdff1aSopenharmony_ci 2001cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2002cabdff1aSopenharmony_ci 2003cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2004cabdff1aSopenharmony_ci const_vec <<= 6; 2005cabdff1aSopenharmony_ci 2006cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { /* fixed trip count: 2 x 4 rows, independent of height */2007cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 2008cabdff1aSopenharmony_ci src += (4 * src_stride); 2009cabdff1aSopenharmony_ci 2010cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2011cabdff1aSopenharmony_ci 2012cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2013cabdff1aSopenharmony_ci dst0 = const_vec; 2014cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2015cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2016cabdff1aSopenharmony_ci dst1 = const_vec; 2017cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2018cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2019cabdff1aSopenharmony_ci dst2 = const_vec; 2020cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2021cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2022cabdff1aSopenharmony_ci dst3 = const_vec; 2023cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2024cabdff1aSopenharmony_ci 2025cabdff1aSopenharmony_ci /* 64-bit element 0 of each result = int16 lanes 0..3 of that row */dst_val0 = __msa_copy_u_d((v2i64) dst0, 0); 2026cabdff1aSopenharmony_ci dst_val1 = __msa_copy_u_d((v2i64) dst1, 0); 2027cabdff1aSopenharmony_ci dst_val2 = __msa_copy_u_d((v2i64) dst2, 0); 2028cabdff1aSopenharmony_ci dst_val3 = __msa_copy_u_d((v2i64) dst3, 0); 2029cabdff1aSopenharmony_ci 2030cabdff1aSopenharmony_ci /* 32-bit element 2 = int16 lanes 4..5, completing 6 values per row */dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2); 2031cabdff1aSopenharmony_ci dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2); 2032cabdff1aSopenharmony_ci dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2); 2033cabdff1aSopenharmony_ci dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2); 2034cabdff1aSopenharmony_ci 2035cabdff1aSopenharmony_ci SD(dst_val0, dst); 2036cabdff1aSopenharmony_ci SW(dst_val_int0, dst + 4); 2037cabdff1aSopenharmony_ci dst += dst_stride; 2038cabdff1aSopenharmony_ci SD(dst_val1, dst); 2039cabdff1aSopenharmony_ci SW(dst_val_int1, dst + 4); 2040cabdff1aSopenharmony_ci dst += dst_stride; 2041cabdff1aSopenharmony_ci SD(dst_val2, dst); 2042cabdff1aSopenharmony_ci SW(dst_val_int2, dst + 4); 2043cabdff1aSopenharmony_ci dst += dst_stride; 2044cabdff1aSopenharmony_ci SD(dst_val3, dst); 2045cabdff1aSopenharmony_ci SW(dst_val_int3, dst + 4); 2046cabdff1aSopenharmony_ci dst += dst_stride; 2047cabdff1aSopenharmony_ci } 2048cabdff1aSopenharmony_ci} 2049cabdff1aSopenharmony_ci /* Horizontal 4-tap filter for 8-column blocks, two rows per loop iteration. Runs height >> 1 iterations, so an odd trailing row is not processed. Each row is shuffled against itself with the first row of ff_hevc_mask_arr ("8 width cases"), accumulated on top of const_vec (128 << 6 per halfword lane) and stored as one full v8i16 via ST_SH2. */2050cabdff1aSopenharmony_cistatic void hevc_hz_4t_8x2multiple_msa(uint8_t *src, 2051cabdff1aSopenharmony_ci int32_t src_stride, 2052cabdff1aSopenharmony_ci int16_t *dst, 2053cabdff1aSopenharmony_ci int32_t dst_stride, 2054cabdff1aSopenharmony_ci const int8_t *filter, 2055cabdff1aSopenharmony_ci
int32_t height) 2056cabdff1aSopenharmony_ci{ 2057cabdff1aSopenharmony_ci uint32_t loop_cnt; 2058cabdff1aSopenharmony_ci v8i16 filt0, filt1, dst0, dst1; 2059cabdff1aSopenharmony_ci v16i8 src0, src1; 2060cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2061cabdff1aSopenharmony_ci v16i8 mask1; 2062cabdff1aSopenharmony_ci v16i8 vec0, vec1; 2063cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2064cabdff1aSopenharmony_ci 2065cabdff1aSopenharmony_ci src -= 1; 2066cabdff1aSopenharmony_ci 2067cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2068cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2069cabdff1aSopenharmony_ci 2070cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2071cabdff1aSopenharmony_ci 2072cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2073cabdff1aSopenharmony_ci const_vec <<= 6; 2074cabdff1aSopenharmony_ci 2075cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 2076cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src1); 2077cabdff1aSopenharmony_ci src += (2 * src_stride); 2078cabdff1aSopenharmony_ci 2079cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 2080cabdff1aSopenharmony_ci 2081cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2082cabdff1aSopenharmony_ci dst0 = const_vec; 2083cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2084cabdff1aSopenharmony_ci 2085cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2086cabdff1aSopenharmony_ci dst1 = const_vec; 2087cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2088cabdff1aSopenharmony_ci 2089cabdff1aSopenharmony_ci ST_SH2(dst0, dst1, dst, dst_stride); 2090cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2091cabdff1aSopenharmony_ci } 2092cabdff1aSopenharmony_ci} 2093cabdff1aSopenharmony_ci /* Horizontal 4-tap filter for 8-column blocks, four rows per loop iteration. Runs height >> 2 iterations, so rows beyond the last full group of four are not processed. Each row is shuffled against itself with the first row of ff_hevc_mask_arr ("8 width cases"), accumulated on top of const_vec (128 << 6 per halfword lane) and stored as one full v8i16 via ST_SH4. */2094cabdff1aSopenharmony_cistatic void hevc_hz_4t_8x4multiple_msa(uint8_t *src, 2095cabdff1aSopenharmony_ci int32_t src_stride,
2096cabdff1aSopenharmony_ci int16_t *dst, 2097cabdff1aSopenharmony_ci int32_t dst_stride, 2098cabdff1aSopenharmony_ci const int8_t *filter, 2099cabdff1aSopenharmony_ci int32_t height) 2100cabdff1aSopenharmony_ci{ 2101cabdff1aSopenharmony_ci uint32_t loop_cnt; 2102cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2103cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2104cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2105cabdff1aSopenharmony_ci v16i8 mask1; 2106cabdff1aSopenharmony_ci v16i8 vec0, vec1; 2107cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 2108cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2109cabdff1aSopenharmony_ci 2110cabdff1aSopenharmony_ci src -= 1; 2111cabdff1aSopenharmony_ci 2112cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2113cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2114cabdff1aSopenharmony_ci 2115cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2116cabdff1aSopenharmony_ci 2117cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2118cabdff1aSopenharmony_ci const_vec <<= 6; 2119cabdff1aSopenharmony_ci 2120cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2121cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 2122cabdff1aSopenharmony_ci src += (4 * src_stride); 2123cabdff1aSopenharmony_ci 2124cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2125cabdff1aSopenharmony_ci 2126cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2127cabdff1aSopenharmony_ci dst0 = const_vec; 2128cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2129cabdff1aSopenharmony_ci 2130cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2131cabdff1aSopenharmony_ci dst1 = const_vec; 2132cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2133cabdff1aSopenharmony_ci 2134cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2135cabdff1aSopenharmony_ci dst2 = const_vec; 2136cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2137cabdff1aSopenharmony_ci 2138cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2139cabdff1aSopenharmony_ci dst3 = const_vec; 2140cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2141cabdff1aSopenharmony_ci 2142cabdff1aSopenharmony_ci ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); 2143cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2144cabdff1aSopenharmony_ci } 2145cabdff1aSopenharmony_ci} 2146cabdff1aSopenharmony_ci /* Height dispatcher for the 8-column horizontal 4-tap filter: heights 2 and 6 go to hevc_hz_4t_8x2multiple_msa (two rows per iteration); every other height goes to hevc_hz_4t_8x4multiple_msa, which only covers height rounded down to a multiple of 4. */2147cabdff1aSopenharmony_cistatic void hevc_hz_4t_8w_msa(uint8_t *src, 2148cabdff1aSopenharmony_ci int32_t src_stride, 2149cabdff1aSopenharmony_ci int16_t *dst, 2150cabdff1aSopenharmony_ci int32_t dst_stride, 2151cabdff1aSopenharmony_ci const int8_t *filter, 2152cabdff1aSopenharmony_ci int32_t height) 2153cabdff1aSopenharmony_ci{ 2154cabdff1aSopenharmony_ci if (2 == height || 6 == height) { 2155cabdff1aSopenharmony_ci hevc_hz_4t_8x2multiple_msa(src, src_stride, dst, dst_stride, 2156cabdff1aSopenharmony_ci filter, height); 2157cabdff1aSopenharmony_ci } else { 2158cabdff1aSopenharmony_ci hevc_hz_4t_8x4multiple_msa(src, src_stride, dst, dst_stride, 2159cabdff1aSopenharmony_ci filter, height); 2160cabdff1aSopenharmony_ci } 2161cabdff1aSopenharmony_ci} 2162cabdff1aSopenharmony_ci /* Horizontal 4-tap filter for 12-column blocks, four rows per loop iteration (height >> 2 iterations; a remainder of height modulo 4 is not processed). Output columns 0..7 of each row come from shuffling the row against itself with mask0/mask1 and are stored with ST_SH4 at dst; output columns 8..11 of two rows at a time are gathered with mask2/mask3 and stored with ST_D4 at dst + 8. */2163cabdff1aSopenharmony_cistatic void hevc_hz_4t_12w_msa(uint8_t *src, 2164cabdff1aSopenharmony_ci int32_t src_stride, 2165cabdff1aSopenharmony_ci int16_t *dst, 2166cabdff1aSopenharmony_ci int32_t dst_stride, 2167cabdff1aSopenharmony_ci const int8_t *filter, 2168cabdff1aSopenharmony_ci int32_t height) 2169cabdff1aSopenharmony_ci{ 2170cabdff1aSopenharmony_ci uint32_t loop_cnt; 2171cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2172cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2173cabdff1aSopenharmony_ci v16i8 mask1; 2174cabdff1aSopenharmony_ci v16i8 vec0, vec1;
2175cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 2176cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2177cabdff1aSopenharmony_ci v16i8 mask3; 2178cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2179cabdff1aSopenharmony_ci /* mask2: byte pairs 8..12 of the first shuffle operand followed by 24..28, i.e. bytes 8..12 of the second operand -- the same window of two different rows */v16i8 mask2 = { 2180cabdff1aSopenharmony_ci 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 2181cabdff1aSopenharmony_ci }; 2182cabdff1aSopenharmony_ci 2183cabdff1aSopenharmony_ci src -= 1; 2184cabdff1aSopenharmony_ci 2185cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2186cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2187cabdff1aSopenharmony_ci 2188cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2189cabdff1aSopenharmony_ci mask3 = mask2 + 2; 2190cabdff1aSopenharmony_ci 2191cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2192cabdff1aSopenharmony_ci const_vec <<= 6; 2193cabdff1aSopenharmony_ci 2194cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2195cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 2196cabdff1aSopenharmony_ci src += (4 * src_stride); 2197cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2198cabdff1aSopenharmony_ci 2199cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2200cabdff1aSopenharmony_ci dst0 = const_vec; 2201cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2202cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2203cabdff1aSopenharmony_ci dst1 = const_vec; 2204cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2205cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 2206cabdff1aSopenharmony_ci dst2 = const_vec; 2207cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2208cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2209cabdff1aSopenharmony_ci dst3 = const_vec;
2210cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2211cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 2212cabdff1aSopenharmony_ci dst4 = const_vec; 2213cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); 2214cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); 2215cabdff1aSopenharmony_ci dst5 = const_vec; 2216cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); 2217cabdff1aSopenharmony_ci 2218cabdff1aSopenharmony_ci ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); 2219cabdff1aSopenharmony_ci ST_D4(dst4, dst5, 0, 1, 0, 1, dst + 8, dst_stride); 2220cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2221cabdff1aSopenharmony_ci } 2222cabdff1aSopenharmony_ci} 2223cabdff1aSopenharmony_ci /* Horizontal 4-tap filter for 16-column blocks, four rows per loop iteration (height >> 2 iterations; a remainder of height modulo 4 is not processed). Every row is loaded as two vectors, at src and at src + 8 (after the initial src -= 1); each vector is shuffled against itself with the first row of ff_hevc_mask_arr ("8 width cases"), accumulated on top of const_vec (128 << 6 per halfword lane) and stored as a v8i16 at dst and dst + 8 respectively. */2224cabdff1aSopenharmony_cistatic void hevc_hz_4t_16w_msa(uint8_t *src, 2225cabdff1aSopenharmony_ci int32_t src_stride, 2226cabdff1aSopenharmony_ci int16_t *dst, 2227cabdff1aSopenharmony_ci int32_t dst_stride, 2228cabdff1aSopenharmony_ci const int8_t *filter, 2229cabdff1aSopenharmony_ci int32_t height) 2230cabdff1aSopenharmony_ci{ 2231cabdff1aSopenharmony_ci uint32_t loop_cnt; 2232cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2233cabdff1aSopenharmony_ci v16i8 src4, src5, src6, src7; 2234cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2235cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2236cabdff1aSopenharmony_ci v16i8 mask1; 2237cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 2238cabdff1aSopenharmony_ci v16i8 vec0, vec1; 2239cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2240cabdff1aSopenharmony_ci 2241cabdff1aSopenharmony_ci src -= 1; 2242cabdff1aSopenharmony_ci 2243cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2244cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2245cabdff1aSopenharmony_ci 2246cabdff1aSopenharmony_ci mask1 = mask0 + 2;
2247cabdff1aSopenharmony_ci 2248cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2249cabdff1aSopenharmony_ci const_vec <<= 6; 2250cabdff1aSopenharmony_ci 2251cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2252cabdff1aSopenharmony_ci /* even-numbered srcN receive the loads at src, odd-numbered srcN the loads at src + 8 */LD_SB4(src, src_stride, src0, src2, src4, src6); 2253cabdff1aSopenharmony_ci LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 2254cabdff1aSopenharmony_ci src += (4 * src_stride); 2255cabdff1aSopenharmony_ci 2256cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2257cabdff1aSopenharmony_ci 2258cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2259cabdff1aSopenharmony_ci dst0 = const_vec; 2260cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2261cabdff1aSopenharmony_ci 2262cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2263cabdff1aSopenharmony_ci dst1 = const_vec; 2264cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2265cabdff1aSopenharmony_ci 2266cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 2267cabdff1aSopenharmony_ci dst2 = const_vec; 2268cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2269cabdff1aSopenharmony_ci 2270cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2271cabdff1aSopenharmony_ci dst3 = const_vec; 2272cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2273cabdff1aSopenharmony_ci 2274cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 2275cabdff1aSopenharmony_ci dst4 = const_vec; 2276cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); 2277cabdff1aSopenharmony_ci 2278cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); 2279cabdff1aSopenharmony_ci dst5 = const_vec; 2280cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0,
vec1, filt0, filt1, dst5, dst5); 2281cabdff1aSopenharmony_ci 2282cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); 2283cabdff1aSopenharmony_ci dst6 = const_vec; 2284cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); 2285cabdff1aSopenharmony_ci 2286cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 2287cabdff1aSopenharmony_ci dst7 = const_vec; 2288cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); 2289cabdff1aSopenharmony_ci 2290cabdff1aSopenharmony_ci ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride); 2291cabdff1aSopenharmony_ci ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride); 2292cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2293cabdff1aSopenharmony_ci } 2294cabdff1aSopenharmony_ci} 2295cabdff1aSopenharmony_ci 2296cabdff1aSopenharmony_cistatic void hevc_hz_4t_24w_msa(uint8_t *src, 2297cabdff1aSopenharmony_ci int32_t src_stride, 2298cabdff1aSopenharmony_ci int16_t *dst, 2299cabdff1aSopenharmony_ci int32_t dst_stride, 2300cabdff1aSopenharmony_ci const int8_t *filter, 2301cabdff1aSopenharmony_ci int32_t height) 2302cabdff1aSopenharmony_ci{ 2303cabdff1aSopenharmony_ci uint32_t loop_cnt; 2304cabdff1aSopenharmony_ci int16_t *dst_tmp = dst + 16; 2305cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2306cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2307cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2308cabdff1aSopenharmony_ci v16i8 mask1, mask00, mask11; 2309cabdff1aSopenharmony_ci v16i8 vec0, vec1; 2310cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 2311cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2312cabdff1aSopenharmony_ci 2313cabdff1aSopenharmony_ci src -= 1; 2314cabdff1aSopenharmony_ci 2315cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2316cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2317cabdff1aSopenharmony_ci 2318cabdff1aSopenharmony_ci mask1 = mask0 + 2; 
2319cabdff1aSopenharmony_ci mask00 = mask0 + 8; 2320cabdff1aSopenharmony_ci mask11 = mask0 + 10; 2321cabdff1aSopenharmony_ci 2322cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2323cabdff1aSopenharmony_ci const_vec <<= 6; 2324cabdff1aSopenharmony_ci 2325cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2326cabdff1aSopenharmony_ci /* 16 width */ 2327cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src2, src4, src6); 2328cabdff1aSopenharmony_ci LD_SB4(src + 16, src_stride, src1, src3, src5, src7); 2329cabdff1aSopenharmony_ci src += (4 * src_stride); 2330cabdff1aSopenharmony_ci 2331cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2332cabdff1aSopenharmony_ci 2333cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2334cabdff1aSopenharmony_ci dst0 = const_vec; 2335cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2336cabdff1aSopenharmony_ci 2337cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1); 2338cabdff1aSopenharmony_ci dst1 = const_vec; 2339cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2340cabdff1aSopenharmony_ci 2341cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 2342cabdff1aSopenharmony_ci dst2 = const_vec; 2343cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2344cabdff1aSopenharmony_ci 2345cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1); 2346cabdff1aSopenharmony_ci dst3 = const_vec; 2347cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2348cabdff1aSopenharmony_ci 2349cabdff1aSopenharmony_ci ST_SH2(dst0, dst1, dst, 8); 2350cabdff1aSopenharmony_ci dst += dst_stride; 2351cabdff1aSopenharmony_ci ST_SH2(dst2, dst3, dst, 8); 2352cabdff1aSopenharmony_ci dst += dst_stride; 2353cabdff1aSopenharmony_ci 2354cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, 
src4, src4, mask0, mask1, vec0, vec1); 2355cabdff1aSopenharmony_ci dst0 = const_vec; 2356cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2357cabdff1aSopenharmony_ci 2358cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1); 2359cabdff1aSopenharmony_ci dst1 = const_vec; 2360cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2361cabdff1aSopenharmony_ci 2362cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); 2363cabdff1aSopenharmony_ci dst2 = const_vec; 2364cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2365cabdff1aSopenharmony_ci 2366cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1); 2367cabdff1aSopenharmony_ci dst3 = const_vec; 2368cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2369cabdff1aSopenharmony_ci 2370cabdff1aSopenharmony_ci ST_SH2(dst0, dst1, dst, 8); 2371cabdff1aSopenharmony_ci dst += dst_stride; 2372cabdff1aSopenharmony_ci ST_SH2(dst2, dst3, dst, 8); 2373cabdff1aSopenharmony_ci dst += dst_stride; 2374cabdff1aSopenharmony_ci 2375cabdff1aSopenharmony_ci /* 8 width */ 2376cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2377cabdff1aSopenharmony_ci dst0 = const_vec; 2378cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2379cabdff1aSopenharmony_ci 2380cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2381cabdff1aSopenharmony_ci dst1 = const_vec; 2382cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2383cabdff1aSopenharmony_ci 2384cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); 2385cabdff1aSopenharmony_ci dst2 = const_vec; 2386cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2387cabdff1aSopenharmony_ci 2388cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, 
mask0, mask1, vec0, vec1); 2389cabdff1aSopenharmony_ci dst3 = const_vec; 2390cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2391cabdff1aSopenharmony_ci 2392cabdff1aSopenharmony_ci ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); 2393cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 2394cabdff1aSopenharmony_ci } 2395cabdff1aSopenharmony_ci} 2396cabdff1aSopenharmony_ci 2397cabdff1aSopenharmony_cistatic void hevc_hz_4t_32w_msa(uint8_t *src, 2398cabdff1aSopenharmony_ci int32_t src_stride, 2399cabdff1aSopenharmony_ci int16_t *dst, 2400cabdff1aSopenharmony_ci int32_t dst_stride, 2401cabdff1aSopenharmony_ci const int8_t *filter, 2402cabdff1aSopenharmony_ci int32_t height) 2403cabdff1aSopenharmony_ci{ 2404cabdff1aSopenharmony_ci uint32_t loop_cnt; 2405cabdff1aSopenharmony_ci v16i8 src0, src1, src2; 2406cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2407cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2408cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 2409cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 2410cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 2411cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2412cabdff1aSopenharmony_ci 2413cabdff1aSopenharmony_ci src -= 1; 2414cabdff1aSopenharmony_ci 2415cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2416cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2417cabdff1aSopenharmony_ci 2418cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2419cabdff1aSopenharmony_ci const_vec <<= 6; 2420cabdff1aSopenharmony_ci 2421cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2422cabdff1aSopenharmony_ci mask2 = mask0 + 8; 2423cabdff1aSopenharmony_ci mask3 = mask0 + 10; 2424cabdff1aSopenharmony_ci 2425cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 2426cabdff1aSopenharmony_ci LD_SB2(src, 16, src0, src1); 2427cabdff1aSopenharmony_ci src2 = LD_SB(src + 24); 2428cabdff1aSopenharmony_ci src += src_stride; 2429cabdff1aSopenharmony_ci 
2430cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2431cabdff1aSopenharmony_ci 2432cabdff1aSopenharmony_ci dst0 = const_vec; 2433cabdff1aSopenharmony_ci dst1 = const_vec; 2434cabdff1aSopenharmony_ci dst2 = const_vec; 2435cabdff1aSopenharmony_ci dst3 = const_vec; 2436cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1); 2437cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3); 2438cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 2439cabdff1aSopenharmony_ci dst1, dst2, dst3); 2440cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1); 2441cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3); 2442cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 2443cabdff1aSopenharmony_ci dst1, dst2, dst3); 2444cabdff1aSopenharmony_ci ST_SH4(dst0, dst1, dst2, dst3, dst, 8); 2445cabdff1aSopenharmony_ci dst += dst_stride; 2446cabdff1aSopenharmony_ci } 2447cabdff1aSopenharmony_ci} 2448cabdff1aSopenharmony_ci 2449cabdff1aSopenharmony_cistatic void hevc_vt_4t_4x2_msa(uint8_t *src, 2450cabdff1aSopenharmony_ci int32_t src_stride, 2451cabdff1aSopenharmony_ci int16_t *dst, 2452cabdff1aSopenharmony_ci int32_t dst_stride, 2453cabdff1aSopenharmony_ci const int8_t *filter) 2454cabdff1aSopenharmony_ci{ 2455cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 2456cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 2457cabdff1aSopenharmony_ci v16i8 src2110, src4332; 2458cabdff1aSopenharmony_ci v8i16 dst10; 2459cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2460cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2461cabdff1aSopenharmony_ci 2462cabdff1aSopenharmony_ci src -= src_stride; 2463cabdff1aSopenharmony_ci 2464cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2465cabdff1aSopenharmony_ci const_vec <<= 6; 2466cabdff1aSopenharmony_ci 
2467cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2468cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2469cabdff1aSopenharmony_ci 2470cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 2471cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, 2472cabdff1aSopenharmony_ci src10_r, src21_r, src32_r, src43_r); 2473cabdff1aSopenharmony_ci 2474cabdff1aSopenharmony_ci ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 2475cabdff1aSopenharmony_ci XORI_B2_128_SB(src2110, src4332); 2476cabdff1aSopenharmony_ci dst10 = const_vec; 2477cabdff1aSopenharmony_ci DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); 2478cabdff1aSopenharmony_ci 2479cabdff1aSopenharmony_ci ST_D2(dst10, 0, 1, dst, dst_stride); 2480cabdff1aSopenharmony_ci} 2481cabdff1aSopenharmony_ci 2482cabdff1aSopenharmony_cistatic void hevc_vt_4t_4x4_msa(uint8_t *src, 2483cabdff1aSopenharmony_ci int32_t src_stride, 2484cabdff1aSopenharmony_ci int16_t *dst, 2485cabdff1aSopenharmony_ci int32_t dst_stride, 2486cabdff1aSopenharmony_ci const int8_t *filter, 2487cabdff1aSopenharmony_ci int32_t height) 2488cabdff1aSopenharmony_ci{ 2489cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 2490cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; 2491cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554; 2492cabdff1aSopenharmony_ci v8i16 dst10, dst32; 2493cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2494cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2495cabdff1aSopenharmony_ci 2496cabdff1aSopenharmony_ci src -= src_stride; 2497cabdff1aSopenharmony_ci 2498cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2499cabdff1aSopenharmony_ci const_vec <<= 6; 2500cabdff1aSopenharmony_ci 2501cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2502cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2503cabdff1aSopenharmony_ci 2504cabdff1aSopenharmony_ci 
LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 2505cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, 2506cabdff1aSopenharmony_ci src10_r, src21_r, src32_r, src43_r); 2507cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 2508cabdff1aSopenharmony_ci ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, 2509cabdff1aSopenharmony_ci src2110, src4332, src6554); 2510cabdff1aSopenharmony_ci XORI_B3_128_SB(src2110, src4332, src6554); 2511cabdff1aSopenharmony_ci dst10 = const_vec; 2512cabdff1aSopenharmony_ci DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); 2513cabdff1aSopenharmony_ci dst32 = const_vec; 2514cabdff1aSopenharmony_ci DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); 2515cabdff1aSopenharmony_ci 2516cabdff1aSopenharmony_ci ST_D4(dst10, dst32, 0, 1, 0, 1, dst, dst_stride); 2517cabdff1aSopenharmony_ci} 2518cabdff1aSopenharmony_ci 2519cabdff1aSopenharmony_cistatic void hevc_vt_4t_4x8_msa(uint8_t *src, 2520cabdff1aSopenharmony_ci int32_t src_stride, 2521cabdff1aSopenharmony_ci int16_t *dst, 2522cabdff1aSopenharmony_ci int32_t dst_stride, 2523cabdff1aSopenharmony_ci const int8_t *filter, 2524cabdff1aSopenharmony_ci int32_t height) 2525cabdff1aSopenharmony_ci{ 2526cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2527cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 2528cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 2529cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554, src8776, src10998; 2530cabdff1aSopenharmony_ci v8i16 dst10, dst32, dst54, dst76; 2531cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2532cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2533cabdff1aSopenharmony_ci 2534cabdff1aSopenharmony_ci src -= src_stride; 2535cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2536cabdff1aSopenharmony_ci const_vec <<= 6; 2537cabdff1aSopenharmony_ci 
2538cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2539cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2540cabdff1aSopenharmony_ci 2541cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2542cabdff1aSopenharmony_ci src += (3 * src_stride); 2543cabdff1aSopenharmony_ci 2544cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2545cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 2546cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 2547cabdff1aSopenharmony_ci 2548cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 2549cabdff1aSopenharmony_ci src += (8 * src_stride); 2550cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 2551cabdff1aSopenharmony_ci src32_r, src43_r, src54_r, src65_r); 2552cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 2553cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 2554cabdff1aSopenharmony_ci ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r, 2555cabdff1aSopenharmony_ci src98_r, src4332, src6554, src8776, src10998); 2556cabdff1aSopenharmony_ci XORI_B4_128_SB(src4332, src6554, src8776, src10998); 2557cabdff1aSopenharmony_ci dst10 = const_vec; 2558cabdff1aSopenharmony_ci dst32 = const_vec; 2559cabdff1aSopenharmony_ci dst54 = const_vec; 2560cabdff1aSopenharmony_ci dst76 = const_vec; 2561cabdff1aSopenharmony_ci DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); 2562cabdff1aSopenharmony_ci DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); 2563cabdff1aSopenharmony_ci DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54); 2564cabdff1aSopenharmony_ci DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76); 2565cabdff1aSopenharmony_ci ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 2566cabdff1aSopenharmony_ci} 
2567cabdff1aSopenharmony_ci 2568cabdff1aSopenharmony_cistatic void hevc_vt_4t_4x16_msa(uint8_t *src, int32_t src_stride, 2569cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 2570cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2571cabdff1aSopenharmony_ci{ 2572cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2573cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 2574cabdff1aSopenharmony_ci v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; 2575cabdff1aSopenharmony_ci v16i8 src10998; 2576cabdff1aSopenharmony_ci v8i16 dst10, dst32, dst54, dst76, filt0, filt1, filter_vec, const_vec; 2577cabdff1aSopenharmony_ci 2578cabdff1aSopenharmony_ci src -= src_stride; 2579cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2580cabdff1aSopenharmony_ci const_vec <<= 6; 2581cabdff1aSopenharmony_ci 2582cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2583cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2584cabdff1aSopenharmony_ci 2585cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2586cabdff1aSopenharmony_ci src += (3 * src_stride); 2587cabdff1aSopenharmony_ci 2588cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2589cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 2590cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 2591cabdff1aSopenharmony_ci 2592cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 2593cabdff1aSopenharmony_ci src += (8 * src_stride); 2594cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r, 2595cabdff1aSopenharmony_ci src54_r, src65_r); 2596cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, 2597cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 
2598cabdff1aSopenharmony_ci ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r, 2599cabdff1aSopenharmony_ci src98_r, src4332, src6554, src8776, src10998); 2600cabdff1aSopenharmony_ci XORI_B4_128_SB(src4332, src6554, src8776, src10998); 2601cabdff1aSopenharmony_ci 2602cabdff1aSopenharmony_ci dst10 = const_vec; 2603cabdff1aSopenharmony_ci dst32 = const_vec; 2604cabdff1aSopenharmony_ci dst54 = const_vec; 2605cabdff1aSopenharmony_ci dst76 = const_vec; 2606cabdff1aSopenharmony_ci DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); 2607cabdff1aSopenharmony_ci DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); 2608cabdff1aSopenharmony_ci DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54); 2609cabdff1aSopenharmony_ci DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76); 2610cabdff1aSopenharmony_ci ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 2611cabdff1aSopenharmony_ci dst += (8 * dst_stride); 2612cabdff1aSopenharmony_ci 2613cabdff1aSopenharmony_ci src2 = src10; 2614cabdff1aSopenharmony_ci src2110 = src10998; 2615cabdff1aSopenharmony_ci 2616cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 2617cabdff1aSopenharmony_ci src += (8 * src_stride); 2618cabdff1aSopenharmony_ci 2619cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r, 2620cabdff1aSopenharmony_ci src54_r, src65_r); 2621cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, 2622cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 2623cabdff1aSopenharmony_ci ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r, 2624cabdff1aSopenharmony_ci src98_r, src4332, src6554, src8776, src10998); 2625cabdff1aSopenharmony_ci XORI_B4_128_SB(src4332, src6554, src8776, src10998); 2626cabdff1aSopenharmony_ci 2627cabdff1aSopenharmony_ci dst10 = const_vec; 2628cabdff1aSopenharmony_ci dst32 = 
const_vec; 2629cabdff1aSopenharmony_ci dst54 = const_vec; 2630cabdff1aSopenharmony_ci dst76 = const_vec; 2631cabdff1aSopenharmony_ci DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); 2632cabdff1aSopenharmony_ci DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); 2633cabdff1aSopenharmony_ci DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54); 2634cabdff1aSopenharmony_ci DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76); 2635cabdff1aSopenharmony_ci ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 2636cabdff1aSopenharmony_ci} 2637cabdff1aSopenharmony_ci 2638cabdff1aSopenharmony_cistatic void hevc_vt_4t_4w_msa(uint8_t *src, 2639cabdff1aSopenharmony_ci int32_t src_stride, 2640cabdff1aSopenharmony_ci int16_t *dst, 2641cabdff1aSopenharmony_ci int32_t dst_stride, 2642cabdff1aSopenharmony_ci const int8_t *filter, 2643cabdff1aSopenharmony_ci int32_t height) 2644cabdff1aSopenharmony_ci{ 2645cabdff1aSopenharmony_ci if (2 == height) { 2646cabdff1aSopenharmony_ci hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter); 2647cabdff1aSopenharmony_ci } else if (4 == height) { 2648cabdff1aSopenharmony_ci hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height); 2649cabdff1aSopenharmony_ci } else if (8 == height) { 2650cabdff1aSopenharmony_ci hevc_vt_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, height); 2651cabdff1aSopenharmony_ci } else if (16 == height) { 2652cabdff1aSopenharmony_ci hevc_vt_4t_4x16_msa(src, src_stride, dst, dst_stride, filter, height); 2653cabdff1aSopenharmony_ci } 2654cabdff1aSopenharmony_ci} 2655cabdff1aSopenharmony_ci 2656cabdff1aSopenharmony_cistatic void hevc_vt_4t_6w_msa(uint8_t *src, 2657cabdff1aSopenharmony_ci int32_t src_stride, 2658cabdff1aSopenharmony_ci int16_t *dst, 2659cabdff1aSopenharmony_ci int32_t dst_stride, 2660cabdff1aSopenharmony_ci const int8_t *filter, 2661cabdff1aSopenharmony_ci int32_t height) 2662cabdff1aSopenharmony_ci{ 2663cabdff1aSopenharmony_ci 
int32_t loop_cnt; 2664cabdff1aSopenharmony_ci uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3; 2665cabdff1aSopenharmony_ci uint64_t dst_val0, dst_val1, dst_val2, dst_val3; 2666cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 2667cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 2668cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 2669cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2670cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2671cabdff1aSopenharmony_ci 2672cabdff1aSopenharmony_ci src -= src_stride; 2673cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2674cabdff1aSopenharmony_ci const_vec <<= 6; 2675cabdff1aSopenharmony_ci 2676cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2677cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2678cabdff1aSopenharmony_ci 2679cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2680cabdff1aSopenharmony_ci src += (3 * src_stride); 2681cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2682cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2683cabdff1aSopenharmony_ci 2684cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2685cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 2686cabdff1aSopenharmony_ci src += (2 * src_stride); 2687cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 2688cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2689cabdff1aSopenharmony_ci 2690cabdff1aSopenharmony_ci dst0_r = const_vec; 2691cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2692cabdff1aSopenharmony_ci dst1_r = const_vec; 2693cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2694cabdff1aSopenharmony_ci 2695cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src1, src2); 2696cabdff1aSopenharmony_ci src += (2 * src_stride); 2697cabdff1aSopenharmony_ci XORI_B2_128_SB(src1, src2); 
2698cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); 2699cabdff1aSopenharmony_ci 2700cabdff1aSopenharmony_ci dst2_r = const_vec; 2701cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r); 2702cabdff1aSopenharmony_ci dst3_r = const_vec; 2703cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r); 2704cabdff1aSopenharmony_ci 2705cabdff1aSopenharmony_ci dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0); 2706cabdff1aSopenharmony_ci dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0); 2707cabdff1aSopenharmony_ci dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0); 2708cabdff1aSopenharmony_ci dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0); 2709cabdff1aSopenharmony_ci 2710cabdff1aSopenharmony_ci dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2); 2711cabdff1aSopenharmony_ci dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2); 2712cabdff1aSopenharmony_ci dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2); 2713cabdff1aSopenharmony_ci dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2); 2714cabdff1aSopenharmony_ci 2715cabdff1aSopenharmony_ci SD(dst_val0, dst); 2716cabdff1aSopenharmony_ci SW(dst_val_int0, dst + 4); 2717cabdff1aSopenharmony_ci dst += dst_stride; 2718cabdff1aSopenharmony_ci SD(dst_val1, dst); 2719cabdff1aSopenharmony_ci SW(dst_val_int1, dst + 4); 2720cabdff1aSopenharmony_ci dst += dst_stride; 2721cabdff1aSopenharmony_ci SD(dst_val2, dst); 2722cabdff1aSopenharmony_ci SW(dst_val_int2, dst + 4); 2723cabdff1aSopenharmony_ci dst += dst_stride; 2724cabdff1aSopenharmony_ci SD(dst_val3, dst); 2725cabdff1aSopenharmony_ci SW(dst_val_int3, dst + 4); 2726cabdff1aSopenharmony_ci dst += dst_stride; 2727cabdff1aSopenharmony_ci } 2728cabdff1aSopenharmony_ci} 2729cabdff1aSopenharmony_ci 2730cabdff1aSopenharmony_cistatic void hevc_vt_4t_8x2_msa(uint8_t *src, 2731cabdff1aSopenharmony_ci int32_t src_stride, 2732cabdff1aSopenharmony_ci int16_t *dst, 2733cabdff1aSopenharmony_ci int32_t dst_stride, 2734cabdff1aSopenharmony_ci 
const int8_t *filter) 2735cabdff1aSopenharmony_ci{ 2736cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 2737cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 2738cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r; 2739cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2740cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2741cabdff1aSopenharmony_ci 2742cabdff1aSopenharmony_ci src -= src_stride; 2743cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2744cabdff1aSopenharmony_ci const_vec <<= 6; 2745cabdff1aSopenharmony_ci 2746cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2747cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2748cabdff1aSopenharmony_ci 2749cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2750cabdff1aSopenharmony_ci src += (3 * src_stride); 2751cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2752cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2753cabdff1aSopenharmony_ci 2754cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 2755cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 2756cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2757cabdff1aSopenharmony_ci dst0_r = const_vec; 2758cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2759cabdff1aSopenharmony_ci dst1_r = const_vec; 2760cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2761cabdff1aSopenharmony_ci 2762cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst1_r, dst, dst_stride); 2763cabdff1aSopenharmony_ci} 2764cabdff1aSopenharmony_ci 2765cabdff1aSopenharmony_cistatic void hevc_vt_4t_8x6_msa(uint8_t *src, 2766cabdff1aSopenharmony_ci int32_t src_stride, 2767cabdff1aSopenharmony_ci int16_t *dst, 2768cabdff1aSopenharmony_ci int32_t dst_stride, 2769cabdff1aSopenharmony_ci const int8_t *filter) 2770cabdff1aSopenharmony_ci{ 2771cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 
2772cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 2773cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r; 2774cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2775cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2776cabdff1aSopenharmony_ci 2777cabdff1aSopenharmony_ci src -= src_stride; 2778cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2779cabdff1aSopenharmony_ci const_vec <<= 6; 2780cabdff1aSopenharmony_ci 2781cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2782cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2783cabdff1aSopenharmony_ci 2784cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2785cabdff1aSopenharmony_ci src += (3 * src_stride); 2786cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2787cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2788cabdff1aSopenharmony_ci 2789cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 2790cabdff1aSopenharmony_ci src += (2 * src_stride); 2791cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 2792cabdff1aSopenharmony_ci 2793cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2794cabdff1aSopenharmony_ci dst0_r = const_vec; 2795cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2796cabdff1aSopenharmony_ci dst1_r = const_vec; 2797cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2798cabdff1aSopenharmony_ci 2799cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst1_r, dst, dst_stride); 2800cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2801cabdff1aSopenharmony_ci 2802cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src1, src2); 2803cabdff1aSopenharmony_ci src += (2 * src_stride); 2804cabdff1aSopenharmony_ci XORI_B2_128_SB(src1, src2); 2805cabdff1aSopenharmony_ci 2806cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); 2807cabdff1aSopenharmony_ci dst0_r = const_vec; 2808cabdff1aSopenharmony_ci 
DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); 2809cabdff1aSopenharmony_ci dst1_r = const_vec; 2810cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 2811cabdff1aSopenharmony_ci 2812cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst1_r, dst, dst_stride); 2813cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2814cabdff1aSopenharmony_ci 2815cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 2816cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 2817cabdff1aSopenharmony_ci 2818cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2819cabdff1aSopenharmony_ci dst0_r = const_vec; 2820cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2821cabdff1aSopenharmony_ci dst1_r = const_vec; 2822cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2823cabdff1aSopenharmony_ci 2824cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst1_r, dst, dst_stride); 2825cabdff1aSopenharmony_ci} 2826cabdff1aSopenharmony_ci 2827cabdff1aSopenharmony_cistatic void hevc_vt_4t_8x4multiple_msa(uint8_t *src, 2828cabdff1aSopenharmony_ci int32_t src_stride, 2829cabdff1aSopenharmony_ci int16_t *dst, 2830cabdff1aSopenharmony_ci int32_t dst_stride, 2831cabdff1aSopenharmony_ci const int8_t *filter, 2832cabdff1aSopenharmony_ci int32_t height) 2833cabdff1aSopenharmony_ci{ 2834cabdff1aSopenharmony_ci int32_t loop_cnt; 2835cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 2836cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r; 2837cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 2838cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2839cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2840cabdff1aSopenharmony_ci 2841cabdff1aSopenharmony_ci src -= src_stride; 2842cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2843cabdff1aSopenharmony_ci const_vec <<= 6; 2844cabdff1aSopenharmony_ci 2845cabdff1aSopenharmony_ci 
filter_vec = LD_SH(filter); 2846cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2847cabdff1aSopenharmony_ci 2848cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2849cabdff1aSopenharmony_ci src += (3 * src_stride); 2850cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2851cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2852cabdff1aSopenharmony_ci 2853cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2854cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src3, src4, src5, src6); 2855cabdff1aSopenharmony_ci src += (4 * src_stride); 2856cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 2857cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2858cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 2859cabdff1aSopenharmony_ci dst0_r = const_vec; 2860cabdff1aSopenharmony_ci dst1_r = const_vec; 2861cabdff1aSopenharmony_ci dst2_r = const_vec; 2862cabdff1aSopenharmony_ci dst3_r = const_vec; 2863cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2864cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2865cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r); 2866cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r); 2867cabdff1aSopenharmony_ci ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); 2868cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2869cabdff1aSopenharmony_ci 2870cabdff1aSopenharmony_ci src2 = src6; 2871cabdff1aSopenharmony_ci src10_r = src54_r; 2872cabdff1aSopenharmony_ci src21_r = src65_r; 2873cabdff1aSopenharmony_ci } 2874cabdff1aSopenharmony_ci} 2875cabdff1aSopenharmony_ci 2876cabdff1aSopenharmony_cistatic void hevc_vt_4t_8w_msa(uint8_t *src, 2877cabdff1aSopenharmony_ci int32_t src_stride, 2878cabdff1aSopenharmony_ci int16_t *dst, 2879cabdff1aSopenharmony_ci int32_t 
dst_stride, 2880cabdff1aSopenharmony_ci const int8_t *filter, 2881cabdff1aSopenharmony_ci int32_t height) 2882cabdff1aSopenharmony_ci{ 2883cabdff1aSopenharmony_ci if (2 == height) { 2884cabdff1aSopenharmony_ci hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter); 2885cabdff1aSopenharmony_ci } else if (6 == height) { 2886cabdff1aSopenharmony_ci hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter); 2887cabdff1aSopenharmony_ci } else { 2888cabdff1aSopenharmony_ci hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride, 2889cabdff1aSopenharmony_ci filter, height); 2890cabdff1aSopenharmony_ci } 2891cabdff1aSopenharmony_ci} 2892cabdff1aSopenharmony_ci 2893cabdff1aSopenharmony_cistatic void hevc_vt_4t_12w_msa(uint8_t *src, 2894cabdff1aSopenharmony_ci int32_t src_stride, 2895cabdff1aSopenharmony_ci int16_t *dst, 2896cabdff1aSopenharmony_ci int32_t dst_stride, 2897cabdff1aSopenharmony_ci const int8_t *filter, 2898cabdff1aSopenharmony_ci int32_t height) 2899cabdff1aSopenharmony_ci{ 2900cabdff1aSopenharmony_ci int32_t loop_cnt; 2901cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 2902cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 2903cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 2904cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; 2905cabdff1aSopenharmony_ci v16i8 src2110, src4332; 2906cabdff1aSopenharmony_ci v16i8 src54_r, src65_r, src6554; 2907cabdff1aSopenharmony_ci v8i16 dst0_l, dst1_l; 2908cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2909cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2910cabdff1aSopenharmony_ci 2911cabdff1aSopenharmony_ci src -= (1 * src_stride); 2912cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2913cabdff1aSopenharmony_ci const_vec <<= 6; 2914cabdff1aSopenharmony_ci 2915cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2916cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2917cabdff1aSopenharmony_ci 
2918cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2919cabdff1aSopenharmony_ci src += (3 * src_stride); 2920cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2921cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2922cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 2923cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); 2924cabdff1aSopenharmony_ci 2925cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 2926cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 2927cabdff1aSopenharmony_ci src += (2 * src_stride); 2928cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src5, src6); 2929cabdff1aSopenharmony_ci src += (2 * src_stride); 2930cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 2931cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 2932cabdff1aSopenharmony_ci 2933cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2934cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 2935cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); 2936cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 2937cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l); 2938cabdff1aSopenharmony_ci src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); 2939cabdff1aSopenharmony_ci 2940cabdff1aSopenharmony_ci dst0_r = const_vec; 2941cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2942cabdff1aSopenharmony_ci dst1_r = const_vec; 2943cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2944cabdff1aSopenharmony_ci dst2_r = const_vec; 2945cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r); 2946cabdff1aSopenharmony_ci dst3_r = const_vec; 2947cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, 
dst3_r); 2948cabdff1aSopenharmony_ci dst0_l = const_vec; 2949cabdff1aSopenharmony_ci DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l); 2950cabdff1aSopenharmony_ci dst1_l = const_vec; 2951cabdff1aSopenharmony_ci DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l); 2952cabdff1aSopenharmony_ci 2953cabdff1aSopenharmony_ci ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); 2954cabdff1aSopenharmony_ci ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride); 2955cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2956cabdff1aSopenharmony_ci 2957cabdff1aSopenharmony_ci src2 = src6; 2958cabdff1aSopenharmony_ci src10_r = src54_r; 2959cabdff1aSopenharmony_ci src21_r = src65_r; 2960cabdff1aSopenharmony_ci src2110 = src6554; 2961cabdff1aSopenharmony_ci } 2962cabdff1aSopenharmony_ci} 2963cabdff1aSopenharmony_ci 2964cabdff1aSopenharmony_cistatic void hevc_vt_4t_16w_msa(uint8_t *src, 2965cabdff1aSopenharmony_ci int32_t src_stride, 2966cabdff1aSopenharmony_ci int16_t *dst, 2967cabdff1aSopenharmony_ci int32_t dst_stride, 2968cabdff1aSopenharmony_ci const int8_t *filter, 2969cabdff1aSopenharmony_ci int32_t height) 2970cabdff1aSopenharmony_ci{ 2971cabdff1aSopenharmony_ci int32_t loop_cnt; 2972cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 2973cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 2974cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src21_l, src43_l; 2975cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst0_l, dst1_l; 2976cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2977cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2978cabdff1aSopenharmony_ci 2979cabdff1aSopenharmony_ci src -= src_stride; 2980cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2981cabdff1aSopenharmony_ci const_vec <<= 6; 2982cabdff1aSopenharmony_ci 2983cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2984cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2985cabdff1aSopenharmony_ci 2986cabdff1aSopenharmony_ci LD_SB3(src, 
src_stride, src0, src1, src2); 2987cabdff1aSopenharmony_ci src += (3 * src_stride); 2988cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2989cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2990cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 2991cabdff1aSopenharmony_ci 2992cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2993cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 2994cabdff1aSopenharmony_ci src += (2 * src_stride); 2995cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 2996cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2997cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 2998cabdff1aSopenharmony_ci dst0_r = const_vec; 2999cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3000cabdff1aSopenharmony_ci dst0_l = const_vec; 3001cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); 3002cabdff1aSopenharmony_ci dst1_r = const_vec; 3003cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3004cabdff1aSopenharmony_ci dst1_l = const_vec; 3005cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); 3006cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst0_l, dst, 8); 3007cabdff1aSopenharmony_ci dst += dst_stride; 3008cabdff1aSopenharmony_ci ST_SH2(dst1_r, dst1_l, dst, 8); 3009cabdff1aSopenharmony_ci dst += dst_stride; 3010cabdff1aSopenharmony_ci 3011cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src5, src2); 3012cabdff1aSopenharmony_ci src += (2 * src_stride); 3013cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src2); 3014cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 3015cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 3016cabdff1aSopenharmony_ci dst0_r = const_vec; 3017cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src10_r, filt0, 
filt1, dst0_r, dst0_r); 3018cabdff1aSopenharmony_ci dst0_l = const_vec; 3019cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); 3020cabdff1aSopenharmony_ci dst1_r = const_vec; 3021cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 3022cabdff1aSopenharmony_ci dst1_l = const_vec; 3023cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); 3024cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst0_l, dst, 8); 3025cabdff1aSopenharmony_ci dst += dst_stride; 3026cabdff1aSopenharmony_ci ST_SH2(dst1_r, dst1_l, dst, 8); 3027cabdff1aSopenharmony_ci dst += dst_stride; 3028cabdff1aSopenharmony_ci } 3029cabdff1aSopenharmony_ci} 3030cabdff1aSopenharmony_ci 3031cabdff1aSopenharmony_cistatic void hevc_vt_4t_24w_msa(uint8_t *src, 3032cabdff1aSopenharmony_ci int32_t src_stride, 3033cabdff1aSopenharmony_ci int16_t *dst, 3034cabdff1aSopenharmony_ci int32_t dst_stride, 3035cabdff1aSopenharmony_ci const int8_t *filter, 3036cabdff1aSopenharmony_ci int32_t height) 3037cabdff1aSopenharmony_ci{ 3038cabdff1aSopenharmony_ci int32_t loop_cnt; 3039cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 3040cabdff1aSopenharmony_ci v16i8 src6, src7, src8, src9, src10, src11; 3041cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r; 3042cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src87_r, src109_r; 3043cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 3044cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src21_l, src43_l; 3045cabdff1aSopenharmony_ci v8i16 dst0_l, dst1_l; 3046cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3047cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 3048cabdff1aSopenharmony_ci 3049cabdff1aSopenharmony_ci src -= src_stride; 3050cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 3051cabdff1aSopenharmony_ci const_vec <<= 6; 3052cabdff1aSopenharmony_ci 3053cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3054cabdff1aSopenharmony_ci 
SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3055cabdff1aSopenharmony_ci 3056cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3057cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3058cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3059cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3060cabdff1aSopenharmony_ci 3061cabdff1aSopenharmony_ci LD_SB3(src + 16, src_stride, src6, src7, src8); 3062cabdff1aSopenharmony_ci src += (3 * src_stride); 3063cabdff1aSopenharmony_ci XORI_B3_128_SB(src6, src7, src8); 3064cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3065cabdff1aSopenharmony_ci 3066cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3067cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 3068cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 3069cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3070cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 3071cabdff1aSopenharmony_ci 3072cabdff1aSopenharmony_ci LD_SB2(src + 16, src_stride, src9, src10); 3073cabdff1aSopenharmony_ci src += (2 * src_stride); 3074cabdff1aSopenharmony_ci XORI_B2_128_SB(src9, src10); 3075cabdff1aSopenharmony_ci ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3076cabdff1aSopenharmony_ci 3077cabdff1aSopenharmony_ci dst0_r = const_vec; 3078cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3079cabdff1aSopenharmony_ci dst0_l = const_vec; 3080cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); 3081cabdff1aSopenharmony_ci dst1_r = const_vec; 3082cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3083cabdff1aSopenharmony_ci dst1_l = const_vec; 3084cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); 3085cabdff1aSopenharmony_ci dst2_r = const_vec; 
3086cabdff1aSopenharmony_ci DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r); 3087cabdff1aSopenharmony_ci dst3_r = const_vec; 3088cabdff1aSopenharmony_ci DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r); 3089cabdff1aSopenharmony_ci 3090cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst0_l, dst, 8); 3091cabdff1aSopenharmony_ci ST_SH(dst2_r, dst + 16); 3092cabdff1aSopenharmony_ci dst += dst_stride; 3093cabdff1aSopenharmony_ci ST_SH2(dst1_r, dst1_l, dst, 8); 3094cabdff1aSopenharmony_ci ST_SH(dst3_r, dst + 16); 3095cabdff1aSopenharmony_ci dst += dst_stride; 3096cabdff1aSopenharmony_ci 3097cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src5, src2); 3098cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src2); 3099cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 3100cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 3101cabdff1aSopenharmony_ci 3102cabdff1aSopenharmony_ci LD_SB2(src + 16, src_stride, src11, src8); 3103cabdff1aSopenharmony_ci src += (2 * src_stride); 3104cabdff1aSopenharmony_ci XORI_B2_128_SB(src11, src8); 3105cabdff1aSopenharmony_ci ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); 3106cabdff1aSopenharmony_ci 3107cabdff1aSopenharmony_ci dst0_r = const_vec; 3108cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); 3109cabdff1aSopenharmony_ci dst0_l = const_vec; 3110cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); 3111cabdff1aSopenharmony_ci dst1_r = const_vec; 3112cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 3113cabdff1aSopenharmony_ci dst1_l = const_vec; 3114cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); 3115cabdff1aSopenharmony_ci dst2_r = const_vec; 3116cabdff1aSopenharmony_ci DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r); 3117cabdff1aSopenharmony_ci dst3_r = const_vec; 3118cabdff1aSopenharmony_ci 
DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r); 3119cabdff1aSopenharmony_ci 3120cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst0_l, dst, 8); 3121cabdff1aSopenharmony_ci ST_SH(dst2_r, dst + 16); 3122cabdff1aSopenharmony_ci dst += dst_stride; 3123cabdff1aSopenharmony_ci ST_SH2(dst1_r, dst1_l, dst, 8); 3124cabdff1aSopenharmony_ci ST_SH(dst3_r, dst + 16); 3125cabdff1aSopenharmony_ci dst += dst_stride; 3126cabdff1aSopenharmony_ci } 3127cabdff1aSopenharmony_ci} 3128cabdff1aSopenharmony_ci 3129cabdff1aSopenharmony_cistatic void hevc_vt_4t_32w_msa(uint8_t *src, 3130cabdff1aSopenharmony_ci int32_t src_stride, 3131cabdff1aSopenharmony_ci int16_t *dst, 3132cabdff1aSopenharmony_ci int32_t dst_stride, 3133cabdff1aSopenharmony_ci const int8_t *filter, 3134cabdff1aSopenharmony_ci int32_t height) 3135cabdff1aSopenharmony_ci{ 3136cabdff1aSopenharmony_ci int32_t loop_cnt; 3137cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 3138cabdff1aSopenharmony_ci v16i8 src6, src7, src8, src9, src10, src11; 3139cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r; 3140cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src87_r, src109_r; 3141cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 3142cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src76_l, src98_l; 3143cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src87_l, src109_l; 3144cabdff1aSopenharmony_ci v8i16 dst0_l, dst1_l, dst2_l, dst3_l; 3145cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3146cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 3147cabdff1aSopenharmony_ci 3148cabdff1aSopenharmony_ci src -= src_stride; 3149cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 3150cabdff1aSopenharmony_ci const_vec <<= 6; 3151cabdff1aSopenharmony_ci 3152cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3153cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3154cabdff1aSopenharmony_ci 3155cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 
3156cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3157cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3158cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3159cabdff1aSopenharmony_ci 3160cabdff1aSopenharmony_ci LD_SB3(src + 16, src_stride, src6, src7, src8); 3161cabdff1aSopenharmony_ci src += (3 * src_stride); 3162cabdff1aSopenharmony_ci XORI_B3_128_SB(src6, src7, src8); 3163cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3164cabdff1aSopenharmony_ci ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); 3165cabdff1aSopenharmony_ci 3166cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3167cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 3168cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 3169cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3170cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 3171cabdff1aSopenharmony_ci 3172cabdff1aSopenharmony_ci LD_SB2(src + 16, src_stride, src9, src10); 3173cabdff1aSopenharmony_ci src += (2 * src_stride); 3174cabdff1aSopenharmony_ci XORI_B2_128_SB(src9, src10); 3175cabdff1aSopenharmony_ci ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3176cabdff1aSopenharmony_ci ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); 3177cabdff1aSopenharmony_ci 3178cabdff1aSopenharmony_ci dst0_r = const_vec; 3179cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3180cabdff1aSopenharmony_ci dst0_l = const_vec; 3181cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); 3182cabdff1aSopenharmony_ci dst1_r = const_vec; 3183cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3184cabdff1aSopenharmony_ci dst1_l = const_vec; 3185cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); 3186cabdff1aSopenharmony_ci dst2_r = 
const_vec; 3187cabdff1aSopenharmony_ci DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r); 3188cabdff1aSopenharmony_ci dst2_l = const_vec; 3189cabdff1aSopenharmony_ci DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l); 3190cabdff1aSopenharmony_ci dst3_r = const_vec; 3191cabdff1aSopenharmony_ci DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r); 3192cabdff1aSopenharmony_ci dst3_l = const_vec; 3193cabdff1aSopenharmony_ci DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l); 3194cabdff1aSopenharmony_ci 3195cabdff1aSopenharmony_ci ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8); 3196cabdff1aSopenharmony_ci dst += dst_stride; 3197cabdff1aSopenharmony_ci ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8); 3198cabdff1aSopenharmony_ci dst += dst_stride; 3199cabdff1aSopenharmony_ci 3200cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src5, src2); 3201cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src2); 3202cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 3203cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 3204cabdff1aSopenharmony_ci 3205cabdff1aSopenharmony_ci LD_SB2(src + 16, src_stride, src11, src8); 3206cabdff1aSopenharmony_ci src += (2 * src_stride); 3207cabdff1aSopenharmony_ci XORI_B2_128_SB(src11, src8); 3208cabdff1aSopenharmony_ci ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); 3209cabdff1aSopenharmony_ci ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l); 3210cabdff1aSopenharmony_ci 3211cabdff1aSopenharmony_ci dst0_r = const_vec; 3212cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); 3213cabdff1aSopenharmony_ci dst0_l = const_vec; 3214cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); 3215cabdff1aSopenharmony_ci dst1_r = const_vec; 3216cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 3217cabdff1aSopenharmony_ci dst1_l = const_vec; 
3218cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); 3219cabdff1aSopenharmony_ci dst2_r = const_vec; 3220cabdff1aSopenharmony_ci DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r); 3221cabdff1aSopenharmony_ci dst2_l = const_vec; 3222cabdff1aSopenharmony_ci DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l); 3223cabdff1aSopenharmony_ci dst3_r = const_vec; 3224cabdff1aSopenharmony_ci DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r); 3225cabdff1aSopenharmony_ci dst3_l = const_vec; 3226cabdff1aSopenharmony_ci DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l); 3227cabdff1aSopenharmony_ci 3228cabdff1aSopenharmony_ci ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8); 3229cabdff1aSopenharmony_ci dst += dst_stride; 3230cabdff1aSopenharmony_ci ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8); 3231cabdff1aSopenharmony_ci dst += dst_stride; 3232cabdff1aSopenharmony_ci } 3233cabdff1aSopenharmony_ci} 3234cabdff1aSopenharmony_ci 3235cabdff1aSopenharmony_cistatic void hevc_hv_4t_4x2_msa(uint8_t *src, 3236cabdff1aSopenharmony_ci int32_t src_stride, 3237cabdff1aSopenharmony_ci int16_t *dst, 3238cabdff1aSopenharmony_ci int32_t dst_stride, 3239cabdff1aSopenharmony_ci const int8_t *filter_x, 3240cabdff1aSopenharmony_ci const int8_t *filter_y) 3241cabdff1aSopenharmony_ci{ 3242cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 3243cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3244cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 3245cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3246cabdff1aSopenharmony_ci v16i8 mask1; 3247cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 3248cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 3249cabdff1aSopenharmony_ci v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43; 3250cabdff1aSopenharmony_ci v4i32 dst0, dst1; 3251cabdff1aSopenharmony_ci 3252cabdff1aSopenharmony_ci src -= (src_stride + 1); 3253cabdff1aSopenharmony_ci 
filter_vec = LD_SH(filter_x); 3254cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3255cabdff1aSopenharmony_ci 3256cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 3257cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 3258cabdff1aSopenharmony_ci 3259cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3260cabdff1aSopenharmony_ci 3261cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3262cabdff1aSopenharmony_ci 3263cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 3264cabdff1aSopenharmony_ci const_vec <<= 6; 3265cabdff1aSopenharmony_ci 3266cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 3267cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 3268cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1); 3269cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3); 3270cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5); 3271cabdff1aSopenharmony_ci 3272cabdff1aSopenharmony_ci dst20 = const_vec; 3273cabdff1aSopenharmony_ci dst31 = const_vec; 3274cabdff1aSopenharmony_ci dst42 = const_vec; 3275cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst20, dst20); 3276cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst31, dst31); 3277cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst42, dst42); 3278cabdff1aSopenharmony_ci ILVRL_H2_SH(dst31, dst20, dst10, dst32); 3279cabdff1aSopenharmony_ci ILVRL_H2_SH(dst42, dst31, dst21, dst43); 3280cabdff1aSopenharmony_ci 3281cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 3282cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 3283cabdff1aSopenharmony_ci dst0 >>= 6; 3284cabdff1aSopenharmony_ci dst1 >>= 6; 3285cabdff1aSopenharmony_ci dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0); 3286cabdff1aSopenharmony_ci ST_D2(dst0, 0, 1, dst, dst_stride); 
3287cabdff1aSopenharmony_ci} 3288cabdff1aSopenharmony_ci 3289cabdff1aSopenharmony_cistatic void hevc_hv_4t_4x4_msa(uint8_t *src, 3290cabdff1aSopenharmony_ci int32_t src_stride, 3291cabdff1aSopenharmony_ci int16_t *dst, 3292cabdff1aSopenharmony_ci int32_t dst_stride, 3293cabdff1aSopenharmony_ci const int8_t *filter_x, 3294cabdff1aSopenharmony_ci const int8_t *filter_y) 3295cabdff1aSopenharmony_ci{ 3296cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 3297cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3298cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 3299cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3300cabdff1aSopenharmony_ci v16i8 mask1; 3301cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3302cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 3303cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65; 3304cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3; 3305cabdff1aSopenharmony_ci 3306cabdff1aSopenharmony_ci src -= (src_stride + 1); 3307cabdff1aSopenharmony_ci 3308cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 3309cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3310cabdff1aSopenharmony_ci 3311cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 3312cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 3313cabdff1aSopenharmony_ci 3314cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3315cabdff1aSopenharmony_ci 3316cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3317cabdff1aSopenharmony_ci 3318cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 3319cabdff1aSopenharmony_ci const_vec <<= 6; 3320cabdff1aSopenharmony_ci 3321cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 3322cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 3323cabdff1aSopenharmony_ci 3324cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src3, src0, src3, mask0, 
mask1, vec0, vec1); 3325cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3); 3326cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5); 3327cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7); 3328cabdff1aSopenharmony_ci 3329cabdff1aSopenharmony_ci dst30 = const_vec; 3330cabdff1aSopenharmony_ci dst41 = const_vec; 3331cabdff1aSopenharmony_ci dst52 = const_vec; 3332cabdff1aSopenharmony_ci dst63 = const_vec; 3333cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst30, dst30); 3334cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst41, dst41); 3335cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst52, dst52); 3336cabdff1aSopenharmony_ci DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst63, dst63); 3337cabdff1aSopenharmony_ci 3338cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10, dst43); 3339cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21, dst54); 3340cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32, dst65); 3341cabdff1aSopenharmony_ci 3342cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 3343cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 3344cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1); 3345cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1); 3346cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 3347cabdff1aSopenharmony_ci PCKEV_H2_SW(dst1, dst0, dst3, dst2, dst0, dst2); 3348cabdff1aSopenharmony_ci ST_D4(dst0, dst2, 0, 1, 0, 1, dst, dst_stride); 3349cabdff1aSopenharmony_ci} 3350cabdff1aSopenharmony_ci 3351cabdff1aSopenharmony_ci 3352cabdff1aSopenharmony_cistatic void hevc_hv_4t_4multx8mult_msa(uint8_t *src, 3353cabdff1aSopenharmony_ci int32_t src_stride, 3354cabdff1aSopenharmony_ci int16_t *dst, 3355cabdff1aSopenharmony_ci int32_t dst_stride, 3356cabdff1aSopenharmony_ci const int8_t *filter_x, 
3357cabdff1aSopenharmony_ci const int8_t *filter_y, 3358cabdff1aSopenharmony_ci int32_t height) 3359cabdff1aSopenharmony_ci{ 3360cabdff1aSopenharmony_ci uint32_t loop_cnt; 3361cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 3362cabdff1aSopenharmony_ci v16i8 src7, src8, src9, src10; 3363cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3364cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 3365cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3366cabdff1aSopenharmony_ci v16i8 mask1; 3367cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 3368cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3369cabdff1aSopenharmony_ci v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 3370cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r; 3371cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r; 3372cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3373cabdff1aSopenharmony_ci 3374cabdff1aSopenharmony_ci src -= (src_stride + 1); 3375cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 3376cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3377cabdff1aSopenharmony_ci 3378cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 3379cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 3380cabdff1aSopenharmony_ci 3381cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3382cabdff1aSopenharmony_ci 3383cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3384cabdff1aSopenharmony_ci 3385cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 3386cabdff1aSopenharmony_ci const_vec <<= 6; 3387cabdff1aSopenharmony_ci 3388cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3389cabdff1aSopenharmony_ci src += (3 * src_stride); 3390cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3391cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 3392cabdff1aSopenharmony_ci 
VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3); 3393cabdff1aSopenharmony_ci dst10 = const_vec; 3394cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10); 3395cabdff1aSopenharmony_ci dst21 = const_vec; 3396cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21); 3397cabdff1aSopenharmony_ci ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 3398cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 3399cabdff1aSopenharmony_ci 3400cabdff1aSopenharmony_ci for (loop_cnt = height >> 3; loop_cnt--;) { 3401cabdff1aSopenharmony_ci LD_SB8(src, src_stride, 3402cabdff1aSopenharmony_ci src3, src4, src5, src6, src7, src8, src9, src10); 3403cabdff1aSopenharmony_ci src += (8 * src_stride); 3404cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3405cabdff1aSopenharmony_ci 3406cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1); 3407cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3); 3408cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5); 3409cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7); 3410cabdff1aSopenharmony_ci 3411cabdff1aSopenharmony_ci dst73 = const_vec; 3412cabdff1aSopenharmony_ci dst84 = const_vec; 3413cabdff1aSopenharmony_ci dst95 = const_vec; 3414cabdff1aSopenharmony_ci dst106 = const_vec; 3415cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73); 3416cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84); 3417cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95); 3418cabdff1aSopenharmony_ci DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106); 3419cabdff1aSopenharmony_ci 3420cabdff1aSopenharmony_ci dst32_r = __msa_ilvr_h(dst73, dst22); 3421cabdff1aSopenharmony_ci ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 3422cabdff1aSopenharmony_ci ILVRL_H2_SH(dst95, 
dst84, dst54_r, dst98_r); 3423cabdff1aSopenharmony_ci ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 3424cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 3425cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst22, dst106); 3426cabdff1aSopenharmony_ci 3427cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3428cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3429cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 3430cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 3431cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 3432cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 3433cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 3434cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 3435cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 3436cabdff1aSopenharmony_ci SRA_4V(dst4, dst5, dst6, dst7, 6); 3437cabdff1aSopenharmony_ci PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 3438cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3439cabdff1aSopenharmony_ci ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 3440cabdff1aSopenharmony_ci dst += (8 * dst_stride); 3441cabdff1aSopenharmony_ci 3442cabdff1aSopenharmony_ci dst10_r = dst98_r; 3443cabdff1aSopenharmony_ci dst21_r = dst109_r; 3444cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 3445cabdff1aSopenharmony_ci } 3446cabdff1aSopenharmony_ci} 3447cabdff1aSopenharmony_ci 3448cabdff1aSopenharmony_cistatic void hevc_hv_4t_4w_msa(uint8_t *src, 3449cabdff1aSopenharmony_ci int32_t src_stride, 3450cabdff1aSopenharmony_ci int16_t *dst, 3451cabdff1aSopenharmony_ci int32_t dst_stride, 3452cabdff1aSopenharmony_ci const int8_t *filter_x, 3453cabdff1aSopenharmony_ci const int8_t *filter_y, 
3454cabdff1aSopenharmony_ci int32_t height) 3455cabdff1aSopenharmony_ci{ 3456cabdff1aSopenharmony_ci if (2 == height) { 3457cabdff1aSopenharmony_ci hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride, 3458cabdff1aSopenharmony_ci filter_x, filter_y); 3459cabdff1aSopenharmony_ci } else if (4 == height) { 3460cabdff1aSopenharmony_ci hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride, 3461cabdff1aSopenharmony_ci filter_x, filter_y); 3462cabdff1aSopenharmony_ci } else if (0 == (height % 8)) { 3463cabdff1aSopenharmony_ci hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride, 3464cabdff1aSopenharmony_ci filter_x, filter_y, height); 3465cabdff1aSopenharmony_ci } 3466cabdff1aSopenharmony_ci}

/*
 * 4-tap horizontal + 4-tap vertical interpolation, 6 samples wide.
 *
 * Reads 3 + 8 = 11 source rows, starting one row above and one column to
 * the left of src, and writes 8 rows of int16_t results.  Each output row
 * is stored as one doubleword at dst (ST_D4) followed by one word at
 * dst + 4 (ST_W4), i.e. presumably 4 + 2 = 6 samples -- confirm against the
 * ST_D4 / ST_W4 definitions in generic_macros_msa.h.
 *
 * src, src_stride   8-bit source samples and the source row pitch
 * dst, dst_stride   16-bit destination and its row pitch in int16_t units
 * filter_x          horizontal taps; halfwords 0 and 1 of the loaded vector
 *                   are splatted into filt0 / filt1
 * filter_y          vertical taps; widened to 16 bit, then words 0 and 1
 *                   are splatted into filt_h0 / filt_h1
 * height            not read by this routine: it always produces 8 rows
 */
static void hevc_hv_4t_6w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                              int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;

    /* start one row above and one column left of the block */
    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    /* same adjacent-byte shuffle pattern as mask0, shifted by two bytes */
    mask1 = mask0 + 2;

    /* 128 << 6 seeds every horizontal accumulator; presumably it cancels
     * the bias introduced by the XORI-128 sign flip below -- confirm */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = const_vec;
    dsth1 = const_vec;
    dsth2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth0, dsth0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth1, dsth1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth2, dsth2);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    dsth3 = const_vec;
    dsth4 = const_vec;
    dsth5 = const_vec;
    dsth6 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth3, dsth3);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth4, dsth4);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth5, dsth5);
    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth6, dsth6);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    dsth7 = const_vec;
    dsth8 = const_vec;
    dsth9 = const_vec;
    dsth10 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth7, dsth7);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth8, dsth8);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth9, dsth9);
    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth10, dsth10);

    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);

    /* pair the low doublewords of two consecutive left-half interleaves so
     * one vertical filter call covers two rows */
    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
    /* first four rows */
    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
    ST_W4(tmp4, 0, 1, 2, 3, dst + 4, dst_stride);
    dst += 4 * dst_stride;
    /* last four rows */
    ST_D4(tmp2, tmp3, 0, 1, 0, 1, dst, dst_stride);
    ST_W4(tmp5, 0, 1, 2, 3, dst + 4, dst_stride);
}

/*
 * 4-tap horizontal + 4-tap vertical interpolation for an 8x2 block.
 *
 * Reads 5 source rows, starting one row above and one column to the left
 * of src, and stores two full vectors (one per output row) with ST_SW2.
 *
 * src, src_stride   8-bit source samples and the source row pitch
 * dst, dst_stride   16-bit destination and its row pitch
 * filter_x          horizontal taps (splatted into filt0 / filt1)
 * filter_y          vertical taps (widened, splatted into filt_h0 / filt_h1)
 */
static void hevc_hv_4t_8x2_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    /* start one row above and one column left of the block */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    /* same adjacent-byte shuffle pattern as mask0, shifted by two bytes */
    mask1 = mask0 + 2;

    /* 128 << 6 seeds every horizontal accumulator; presumably it cancels
     * the bias introduced by the XORI-128 sign flip below -- confirm */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    dst3 = const_vec;
    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
    dst4 = const_vec;
    DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
}

/*
 * 4-tap horizontal + 4-tap vertical interpolation for width8mult
 * side-by-side strips, each 8 columns wide and 4 rows high.
 *
 * Every loop iteration reads 7 source rows, stores 4 full output vectors
 * with ST_SW4, then advances src by 8 bytes and dst by 8 int16_t.
 *
 * src, src_stride   8-bit source samples and the source row pitch
 * dst, dst_stride   16-bit destination and its row pitch
 * filter_x          horizontal taps (splatted into filt0 / filt1)
 * filter_y          vertical taps (widened, splatted into filt_h0 / filt_h1)
 * width8mult        number of 8-column strips to process
 */
static void hevc_hv_4t_8multx4_msa(uint8_t *src, int32_t src_stride,
                                   int16_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y, int32_t width8mult)
{
    int32_t cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;

    /* start one row above and one column left of the block */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    /* same adjacent-byte shuffle pattern as mask0, shifted by two bytes */
    mask1 = mask0 + 2;

    /* 128 << 6 seeds every horizontal accumulator; presumably it cancels
     * the bias introduced by the XORI-128 sign flip below -- confirm */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);

        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);

        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += 8;
    }
}

/*
 * 4-tap horizontal + 4-tap vertical interpolation for an 8x6 block.
 *
 * Reads 5 + 4 = 9 source rows, starting one row above and one column to
 * the left of src, and stores six full output vectors, two rows at a time.
 *
 * src, src_stride   8-bit source samples and the source row pitch
 * dst, dst_stride   16-bit destination and its row pitch in int16_t units
 * filter_x          horizontal taps (splatted into filt0 / filt1)
 * filter_y          vertical taps (widened, splatted into filt_h0 / filt_h1)
 */
static void hevc_hv_4t_8x6_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;

    /* start one row above and one column left of the block */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    /* same adjacent-byte shuffle pattern as mask0, shifted by two bytes */
    mask1 = mask0 + 2;

    /* 128 << 6 seeds every horizontal accumulator; presumably it cancels
     * the bias introduced by the XORI-128 sign flip below -- confirm */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    dst3 = const_vec;
    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
    dst4 = const_vec;
    DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);
    dst5 = const_vec;
    DPADD_SB2_SH(vec10, vec11, filt0, filt1, dst5, dst5);
    dst6 = const_vec;
    DPADD_SB2_SH(vec12, vec13, filt0, filt1, dst6, dst6);
    dst7 = const_vec;
    DPADD_SB2_SH(vec14, vec15, filt0, filt1, dst7, dst7);
    dst8 = const_vec;
    DPADD_SB2_SH(vec16, vec17, filt0, filt1, dst8, dst8);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);

    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);

    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst2_r, dst3_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst4_r, dst5_r, dst, dst_stride);
}

/*
 * 4-tap horizontal + 4-tap vertical interpolation for width8mult
 * side-by-side strips, each 8 columns wide and (height >> 2) * 4 rows high.
 *
 * For every strip the first 3 rows are filtered horizontally up front; the
 * inner loop then consumes 4 new rows per iteration, stores 4 output
 * vectors, and carries the last two row interleaves plus the last
 * horizontal result into the next iteration.
 *
 * src, src_stride   8-bit source samples and the source row pitch
 * dst, dst_stride   16-bit destination and its row pitch in int16_t units
 * filter_x          horizontal taps (splatted into filt0 / filt1)
 * filter_y          vertical taps (widened, splatted into filt_h0 / filt_h1)
 * height            row count; only height >> 2 groups of 4 rows are done
 * width8mult        number of 8-column strips to process
 */
static void hevc_hv_4t_8multx4mult_msa(uint8_t *src,
                                       int32_t src_stride,
                                       int16_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t height,
                                       int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    int16_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    /* start one row above and one column left of the block */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    /* same adjacent-byte shuffle pattern as mask0, shifted by two bytes */
    mask1 = mask0 + 2;

    /* 128 << 6 seeds every horizontal accumulator; presumably it cancels
     * the bias introduced by the XORI-128 sign flip below -- confirm */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dst3 = const_vec;
            dst4 = const_vec;
            dst5 = const_vec;
            dst6 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
            DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
            DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
            DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst0_r, dst1_r, dst2_r, dst3_r);

            ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            /* slide the vertical window: the newest rows of this group
             * become the oldest rows of the next one */
            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        src += 8;
        dst += 8;
    }
}

/*
 * Height dispatcher for the 8-wide 4-tap hv kernels.
 *
 * height == 2, 4 and 6 go to dedicated routines; any other multiple of 4
 * goes to the generic 4-rows-per-iteration routine with a single 8-column
 * strip.  Any other height falls through every branch and nothing is
 * written.
 */
static void hevc_hv_4t_8w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                              int32_t height)
{

    if (2 == height) {
        hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (0 == (height % 4)) {
        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 1);
    }
}

3996cabdff1aSopenharmony_cistatic void hevc_hv_4t_12w_msa(uint8_t *src, 3997cabdff1aSopenharmony_ci int32_t src_stride, 3998cabdff1aSopenharmony_ci int16_t *dst, 3999cabdff1aSopenharmony_ci int32_t dst_stride, 4000cabdff1aSopenharmony_ci const int8_t *filter_x, 
4001cabdff1aSopenharmony_ci const int8_t *filter_y, 4002cabdff1aSopenharmony_ci int32_t height) 4003cabdff1aSopenharmony_ci{ 4004cabdff1aSopenharmony_ci uint32_t loop_cnt; 4005cabdff1aSopenharmony_ci uint8_t *src_tmp; 4006cabdff1aSopenharmony_ci int16_t *dst_tmp; 4007cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4008cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4009cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3; 4010cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec; 4011cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73; 4012cabdff1aSopenharmony_ci v8i16 dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r; 4013cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 4014cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 4015cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4016cabdff1aSopenharmony_ci v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 4017cabdff1aSopenharmony_ci 4018cabdff1aSopenharmony_ci src -= (src_stride + 1); 4019cabdff1aSopenharmony_ci 4020cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4021cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4022cabdff1aSopenharmony_ci 4023cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4024cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4025cabdff1aSopenharmony_ci 4026cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4027cabdff1aSopenharmony_ci 4028cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 4029cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4030cabdff1aSopenharmony_ci 4031cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 4032cabdff1aSopenharmony_ci const_vec <<= 6; 4033cabdff1aSopenharmony_ci 4034cabdff1aSopenharmony_ci src_tmp = src; 
4035cabdff1aSopenharmony_ci dst_tmp = dst; 4036cabdff1aSopenharmony_ci 4037cabdff1aSopenharmony_ci LD_SB3(src_tmp, src_stride, src0, src1, src2); 4038cabdff1aSopenharmony_ci src_tmp += (3 * src_stride); 4039cabdff1aSopenharmony_ci 4040cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4041cabdff1aSopenharmony_ci 4042cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4043cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4044cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4045cabdff1aSopenharmony_ci 4046cabdff1aSopenharmony_ci dst0 = const_vec; 4047cabdff1aSopenharmony_ci dst1 = const_vec; 4048cabdff1aSopenharmony_ci dst2 = const_vec; 4049cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 4050cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); 4051cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); 4052cabdff1aSopenharmony_ci 4053cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4054cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4055cabdff1aSopenharmony_ci 4056cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 4057cabdff1aSopenharmony_ci LD_SB4(src_tmp, src_stride, src3, src4, src5, src6); 4058cabdff1aSopenharmony_ci src_tmp += (4 * src_stride); 4059cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 4060cabdff1aSopenharmony_ci 4061cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4062cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4063cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4064cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4065cabdff1aSopenharmony_ci 4066cabdff1aSopenharmony_ci dst3 = const_vec; 4067cabdff1aSopenharmony_ci dst4 = const_vec; 4068cabdff1aSopenharmony_ci 
dst5 = const_vec; 4069cabdff1aSopenharmony_ci dst6 = const_vec; 4070cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 4071cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4); 4072cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5); 4073cabdff1aSopenharmony_ci DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6); 4074cabdff1aSopenharmony_ci 4075cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4076cabdff1aSopenharmony_ci ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4077cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4078cabdff1aSopenharmony_ci ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4079cabdff1aSopenharmony_ci 4080cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4081cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4082cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4083cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4084cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4085cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4086cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4087cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4088cabdff1aSopenharmony_ci 4089cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4090cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4091cabdff1aSopenharmony_ci PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 4092cabdff1aSopenharmony_ci dst3_r, dst0_r, dst1_r, dst2_r, dst3_r); 4093cabdff1aSopenharmony_ci ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride); 4094cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 4095cabdff1aSopenharmony_ci 4096cabdff1aSopenharmony_ci dst10_r = dst54_r; 
4097cabdff1aSopenharmony_ci dst10_l = dst54_l; 4098cabdff1aSopenharmony_ci dst21_r = dst65_r; 4099cabdff1aSopenharmony_ci dst21_l = dst65_l; 4100cabdff1aSopenharmony_ci dst2 = dst6; 4101cabdff1aSopenharmony_ci } 4102cabdff1aSopenharmony_ci 4103cabdff1aSopenharmony_ci src += 8; 4104cabdff1aSopenharmony_ci dst += 8; 4105cabdff1aSopenharmony_ci 4106cabdff1aSopenharmony_ci mask2 = LD_SB(ff_hevc_mask_arr + 16); 4107cabdff1aSopenharmony_ci mask3 = mask2 + 2; 4108cabdff1aSopenharmony_ci 4109cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 4110cabdff1aSopenharmony_ci src += (3 * src_stride); 4111cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4112cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 4113cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3); 4114cabdff1aSopenharmony_ci dst10 = const_vec; 4115cabdff1aSopenharmony_ci dst21 = const_vec; 4116cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10); 4117cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21); 4118cabdff1aSopenharmony_ci ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 4119cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 4120cabdff1aSopenharmony_ci 4121cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 4122cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, 4123cabdff1aSopenharmony_ci src10); 4124cabdff1aSopenharmony_ci src += (8 * src_stride); 4125cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4126cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1); 4127cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3); 4128cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5); 4129cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7); 
4130cabdff1aSopenharmony_ci 4131cabdff1aSopenharmony_ci dst73 = const_vec; 4132cabdff1aSopenharmony_ci dst84 = const_vec; 4133cabdff1aSopenharmony_ci dst95 = const_vec; 4134cabdff1aSopenharmony_ci dst106 = const_vec; 4135cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73); 4136cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84); 4137cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95); 4138cabdff1aSopenharmony_ci DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106); 4139cabdff1aSopenharmony_ci 4140cabdff1aSopenharmony_ci dst32_r = __msa_ilvr_h(dst73, dst22); 4141cabdff1aSopenharmony_ci ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 4142cabdff1aSopenharmony_ci ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 4143cabdff1aSopenharmony_ci ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 4144cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 4145cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst22, dst106); 4146cabdff1aSopenharmony_ci 4147cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4148cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4149cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4150cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4151cabdff1aSopenharmony_ci tmp4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4152cabdff1aSopenharmony_ci tmp5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4153cabdff1aSopenharmony_ci tmp6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4154cabdff1aSopenharmony_ci tmp7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4155cabdff1aSopenharmony_ci 4156cabdff1aSopenharmony_ci SRA_4V(tmp0, tmp1, tmp2, tmp3, 6); 4157cabdff1aSopenharmony_ci SRA_4V(tmp4, tmp5, tmp6, tmp7, 6); 4158cabdff1aSopenharmony_ci PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, tmp0, tmp1, 
4159cabdff1aSopenharmony_ci tmp2, tmp3); 4160cabdff1aSopenharmony_ci ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 4161cabdff1aSopenharmony_ci dst += (8 * dst_stride); 4162cabdff1aSopenharmony_ci 4163cabdff1aSopenharmony_ci dst10_r = dst98_r; 4164cabdff1aSopenharmony_ci dst21_r = dst109_r; 4165cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 4166cabdff1aSopenharmony_ci } 4167cabdff1aSopenharmony_ci} 4168cabdff1aSopenharmony_ci 4169cabdff1aSopenharmony_cistatic void hevc_hv_4t_16w_msa(uint8_t *src, 4170cabdff1aSopenharmony_ci int32_t src_stride, 4171cabdff1aSopenharmony_ci int16_t *dst, 4172cabdff1aSopenharmony_ci int32_t dst_stride, 4173cabdff1aSopenharmony_ci const int8_t *filter_x, 4174cabdff1aSopenharmony_ci const int8_t *filter_y, 4175cabdff1aSopenharmony_ci int32_t height) 4176cabdff1aSopenharmony_ci{ 4177cabdff1aSopenharmony_ci if (4 == height) { 4178cabdff1aSopenharmony_ci hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride, 4179cabdff1aSopenharmony_ci filter_x, filter_y, 2); 4180cabdff1aSopenharmony_ci } else { 4181cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 4182cabdff1aSopenharmony_ci filter_x, filter_y, height, 2); 4183cabdff1aSopenharmony_ci } 4184cabdff1aSopenharmony_ci} 4185cabdff1aSopenharmony_ci 4186cabdff1aSopenharmony_cistatic void hevc_hv_4t_24w_msa(uint8_t *src, 4187cabdff1aSopenharmony_ci int32_t src_stride, 4188cabdff1aSopenharmony_ci int16_t *dst, 4189cabdff1aSopenharmony_ci int32_t dst_stride, 4190cabdff1aSopenharmony_ci const int8_t *filter_x, 4191cabdff1aSopenharmony_ci const int8_t *filter_y, 4192cabdff1aSopenharmony_ci int32_t height) 4193cabdff1aSopenharmony_ci{ 4194cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 4195cabdff1aSopenharmony_ci filter_x, filter_y, height, 3); 4196cabdff1aSopenharmony_ci} 4197cabdff1aSopenharmony_ci 4198cabdff1aSopenharmony_cistatic void hevc_hv_4t_32w_msa(uint8_t *src, 
4199cabdff1aSopenharmony_ci int32_t src_stride, 4200cabdff1aSopenharmony_ci int16_t *dst, 4201cabdff1aSopenharmony_ci int32_t dst_stride, 4202cabdff1aSopenharmony_ci const int8_t *filter_x, 4203cabdff1aSopenharmony_ci const int8_t *filter_y, 4204cabdff1aSopenharmony_ci int32_t height) 4205cabdff1aSopenharmony_ci{ 4206cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 4207cabdff1aSopenharmony_ci filter_x, filter_y, height, 4); 4208cabdff1aSopenharmony_ci} 4209cabdff1aSopenharmony_ci 4210cabdff1aSopenharmony_ci#define MC_COPY(WIDTH) \ 4211cabdff1aSopenharmony_civoid ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst, \ 4212cabdff1aSopenharmony_ci uint8_t *src, \ 4213cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 4214cabdff1aSopenharmony_ci int height, \ 4215cabdff1aSopenharmony_ci intptr_t mx, \ 4216cabdff1aSopenharmony_ci intptr_t my, \ 4217cabdff1aSopenharmony_ci int width) \ 4218cabdff1aSopenharmony_ci{ \ 4219cabdff1aSopenharmony_ci hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height); \ 4220cabdff1aSopenharmony_ci} 4221cabdff1aSopenharmony_ci 4222cabdff1aSopenharmony_ciMC_COPY(4); 4223cabdff1aSopenharmony_ciMC_COPY(6); 4224cabdff1aSopenharmony_ciMC_COPY(8); 4225cabdff1aSopenharmony_ciMC_COPY(12); 4226cabdff1aSopenharmony_ciMC_COPY(16); 4227cabdff1aSopenharmony_ciMC_COPY(24); 4228cabdff1aSopenharmony_ciMC_COPY(32); 4229cabdff1aSopenharmony_ciMC_COPY(48); 4230cabdff1aSopenharmony_ciMC_COPY(64); 4231cabdff1aSopenharmony_ci 4232cabdff1aSopenharmony_ci#undef MC_COPY 4233cabdff1aSopenharmony_ci 4234cabdff1aSopenharmony_ci#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 4235cabdff1aSopenharmony_civoid ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst, \ 4236cabdff1aSopenharmony_ci uint8_t *src, \ 4237cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 4238cabdff1aSopenharmony_ci int height, \ 4239cabdff1aSopenharmony_ci intptr_t mx, \ 4240cabdff1aSopenharmony_ci intptr_t my, \ 4241cabdff1aSopenharmony_ci int 
width) \ 4242cabdff1aSopenharmony_ci{ \ 4243cabdff1aSopenharmony_ci const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 4244cabdff1aSopenharmony_ci \ 4245cabdff1aSopenharmony_ci hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \ 4246cabdff1aSopenharmony_ci MAX_PB_SIZE, filter, height); \ 4247cabdff1aSopenharmony_ci} 4248cabdff1aSopenharmony_ci 4249cabdff1aSopenharmony_ciMC(qpel, h, 4, 8, hz, mx); 4250cabdff1aSopenharmony_ciMC(qpel, h, 8, 8, hz, mx); 4251cabdff1aSopenharmony_ciMC(qpel, h, 12, 8, hz, mx); 4252cabdff1aSopenharmony_ciMC(qpel, h, 16, 8, hz, mx); 4253cabdff1aSopenharmony_ciMC(qpel, h, 24, 8, hz, mx); 4254cabdff1aSopenharmony_ciMC(qpel, h, 32, 8, hz, mx); 4255cabdff1aSopenharmony_ciMC(qpel, h, 48, 8, hz, mx); 4256cabdff1aSopenharmony_ciMC(qpel, h, 64, 8, hz, mx); 4257cabdff1aSopenharmony_ci 4258cabdff1aSopenharmony_ciMC(qpel, v, 4, 8, vt, my); 4259cabdff1aSopenharmony_ciMC(qpel, v, 8, 8, vt, my); 4260cabdff1aSopenharmony_ciMC(qpel, v, 12, 8, vt, my); 4261cabdff1aSopenharmony_ciMC(qpel, v, 16, 8, vt, my); 4262cabdff1aSopenharmony_ciMC(qpel, v, 24, 8, vt, my); 4263cabdff1aSopenharmony_ciMC(qpel, v, 32, 8, vt, my); 4264cabdff1aSopenharmony_ciMC(qpel, v, 48, 8, vt, my); 4265cabdff1aSopenharmony_ciMC(qpel, v, 64, 8, vt, my); 4266cabdff1aSopenharmony_ci 4267cabdff1aSopenharmony_ciMC(epel, h, 4, 4, hz, mx); 4268cabdff1aSopenharmony_ciMC(epel, h, 6, 4, hz, mx); 4269cabdff1aSopenharmony_ciMC(epel, h, 8, 4, hz, mx); 4270cabdff1aSopenharmony_ciMC(epel, h, 12, 4, hz, mx); 4271cabdff1aSopenharmony_ciMC(epel, h, 16, 4, hz, mx); 4272cabdff1aSopenharmony_ciMC(epel, h, 24, 4, hz, mx); 4273cabdff1aSopenharmony_ciMC(epel, h, 32, 4, hz, mx); 4274cabdff1aSopenharmony_ci 4275cabdff1aSopenharmony_ciMC(epel, v, 4, 4, vt, my); 4276cabdff1aSopenharmony_ciMC(epel, v, 6, 4, vt, my); 4277cabdff1aSopenharmony_ciMC(epel, v, 8, 4, vt, my); 4278cabdff1aSopenharmony_ciMC(epel, v, 12, 4, vt, my); 4279cabdff1aSopenharmony_ciMC(epel, v, 16, 4, vt, my); 
4280cabdff1aSopenharmony_ciMC(epel, v, 24, 4, vt, my); 4281cabdff1aSopenharmony_ciMC(epel, v, 32, 4, vt, my); 4282cabdff1aSopenharmony_ci 4283cabdff1aSopenharmony_ci#undef MC 4284cabdff1aSopenharmony_ci 4285cabdff1aSopenharmony_ci#define MC_HV(PEL, WIDTH, TAP) \ 4286cabdff1aSopenharmony_civoid ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_msa(int16_t *dst, \ 4287cabdff1aSopenharmony_ci uint8_t *src, \ 4288cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 4289cabdff1aSopenharmony_ci int height, \ 4290cabdff1aSopenharmony_ci intptr_t mx, \ 4291cabdff1aSopenharmony_ci intptr_t my, \ 4292cabdff1aSopenharmony_ci int width) \ 4293cabdff1aSopenharmony_ci{ \ 4294cabdff1aSopenharmony_ci const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 4295cabdff1aSopenharmony_ci const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 4296cabdff1aSopenharmony_ci \ 4297cabdff1aSopenharmony_ci hevc_hv_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, \ 4298cabdff1aSopenharmony_ci filter_x, filter_y, height); \ 4299cabdff1aSopenharmony_ci} 4300cabdff1aSopenharmony_ci 4301cabdff1aSopenharmony_ciMC_HV(qpel, 4, 8); 4302cabdff1aSopenharmony_ciMC_HV(qpel, 8, 8); 4303cabdff1aSopenharmony_ciMC_HV(qpel, 12, 8); 4304cabdff1aSopenharmony_ciMC_HV(qpel, 16, 8); 4305cabdff1aSopenharmony_ciMC_HV(qpel, 24, 8); 4306cabdff1aSopenharmony_ciMC_HV(qpel, 32, 8); 4307cabdff1aSopenharmony_ciMC_HV(qpel, 48, 8); 4308cabdff1aSopenharmony_ciMC_HV(qpel, 64, 8); 4309cabdff1aSopenharmony_ci 4310cabdff1aSopenharmony_ciMC_HV(epel, 4, 4); 4311cabdff1aSopenharmony_ciMC_HV(epel, 6, 4); 4312cabdff1aSopenharmony_ciMC_HV(epel, 8, 4); 4313cabdff1aSopenharmony_ciMC_HV(epel, 12, 4); 4314cabdff1aSopenharmony_ciMC_HV(epel, 16, 4); 4315cabdff1aSopenharmony_ciMC_HV(epel, 24, 4); 4316cabdff1aSopenharmony_ciMC_HV(epel, 32, 4); 4317cabdff1aSopenharmony_ci 4318cabdff1aSopenharmony_ci#undef MC_HV 4319