1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 22cabdff1aSopenharmony_ci#include "libavcodec/mips/hevcdsp_mips.h" 23cabdff1aSopenharmony_ci#include "libavcodec/mips/hevc_macros_msa.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_cistatic const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = { 26cabdff1aSopenharmony_ci /* 8 width cases */ 27cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 28cabdff1aSopenharmony_ci /* 4 width cases */ 29cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 30cabdff1aSopenharmony_ci /* 4 width cases */ 31cabdff1aSopenharmony_ci 
8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 32cabdff1aSopenharmony_ci}; 33cabdff1aSopenharmony_ci 34cabdff1aSopenharmony_ci#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 35cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, \ 36cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, \ 37cabdff1aSopenharmony_ci out0, out1) \ 38cabdff1aSopenharmony_ci{ \ 39cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 40cabdff1aSopenharmony_ci \ 41cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 42cabdff1aSopenharmony_ci DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 43cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 44cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 45cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ 46cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ 47cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ 48cabdff1aSopenharmony_ci DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1); \ 49cabdff1aSopenharmony_ci} 50cabdff1aSopenharmony_ci 51cabdff1aSopenharmony_ci#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 52cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, \ 53cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, \ 54cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 55cabdff1aSopenharmony_ci{ \ 56cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 57cabdff1aSopenharmony_ci \ 58cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 59cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 60cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 61cabdff1aSopenharmony_ci out0, 
out1, out2, out3); \ 62cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ 63cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ 64cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ 65cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 66cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ 67cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ 68cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ 69cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 70cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ 71cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ 72cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ 73cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 74cabdff1aSopenharmony_ci} 75cabdff1aSopenharmony_ci 76cabdff1aSopenharmony_ci#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 77cabdff1aSopenharmony_ci mask0, mask1, filt0, filt1, \ 78cabdff1aSopenharmony_ci out0, out1) \ 79cabdff1aSopenharmony_ci{ \ 80cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 81cabdff1aSopenharmony_ci \ 82cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 83cabdff1aSopenharmony_ci DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 84cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 85cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 86cabdff1aSopenharmony_ci} 87cabdff1aSopenharmony_ci 88cabdff1aSopenharmony_ci#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 89cabdff1aSopenharmony_ci mask0, mask1, filt0, filt1, \ 90cabdff1aSopenharmony_ci out0, out1, out2, 
out3) \ 91cabdff1aSopenharmony_ci{ \ 92cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 93cabdff1aSopenharmony_ci \ 94cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 95cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 96cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 97cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 98cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 99cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 100cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 101cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 102cabdff1aSopenharmony_ci} 103cabdff1aSopenharmony_ci 104cabdff1aSopenharmony_cistatic void copy_width8_msa(uint8_t *src, int32_t src_stride, 105cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 106cabdff1aSopenharmony_ci int32_t height) 107cabdff1aSopenharmony_ci{ 108cabdff1aSopenharmony_ci int32_t cnt; 109cabdff1aSopenharmony_ci uint64_t out0, out1, out2, out3, out4, out5, out6, out7; 110cabdff1aSopenharmony_ci 111cabdff1aSopenharmony_ci if (2 == height) { 112cabdff1aSopenharmony_ci LD2(src, src_stride, out0, out1); 113cabdff1aSopenharmony_ci SD(out0, dst); 114cabdff1aSopenharmony_ci dst += dst_stride; 115cabdff1aSopenharmony_ci SD(out1, dst); 116cabdff1aSopenharmony_ci } else if (6 == height) { 117cabdff1aSopenharmony_ci LD4(src, src_stride, out0, out1, out2, out3); 118cabdff1aSopenharmony_ci src += (4 * src_stride); 119cabdff1aSopenharmony_ci SD4(out0, out1, out2, out3, dst, dst_stride); 120cabdff1aSopenharmony_ci dst += (4 * dst_stride); 121cabdff1aSopenharmony_ci LD2(src, src_stride, out0, out1); 122cabdff1aSopenharmony_ci SD(out0, dst); 123cabdff1aSopenharmony_ci dst += dst_stride; 124cabdff1aSopenharmony_ci SD(out1, dst); 125cabdff1aSopenharmony_ci 
} else if (0 == (height % 8)) { 126cabdff1aSopenharmony_ci for (cnt = (height >> 3); cnt--;) { 127cabdff1aSopenharmony_ci LD4(src, src_stride, out0, out1, out2, out3); 128cabdff1aSopenharmony_ci src += (4 * src_stride); 129cabdff1aSopenharmony_ci LD4(src, src_stride, out4, out5, out6, out7); 130cabdff1aSopenharmony_ci src += (4 * src_stride); 131cabdff1aSopenharmony_ci SD4(out0, out1, out2, out3, dst, dst_stride); 132cabdff1aSopenharmony_ci dst += (4 * dst_stride); 133cabdff1aSopenharmony_ci SD4(out4, out5, out6, out7, dst, dst_stride); 134cabdff1aSopenharmony_ci dst += (4 * dst_stride); 135cabdff1aSopenharmony_ci } 136cabdff1aSopenharmony_ci } else if (0 == (height % 4)) { 137cabdff1aSopenharmony_ci for (cnt = (height >> 2); cnt--;) { 138cabdff1aSopenharmony_ci LD4(src, src_stride, out0, out1, out2, out3); 139cabdff1aSopenharmony_ci src += (4 * src_stride); 140cabdff1aSopenharmony_ci SD4(out0, out1, out2, out3, dst, dst_stride); 141cabdff1aSopenharmony_ci dst += (4 * dst_stride); 142cabdff1aSopenharmony_ci } 143cabdff1aSopenharmony_ci } 144cabdff1aSopenharmony_ci} 145cabdff1aSopenharmony_ci 146cabdff1aSopenharmony_cistatic void copy_width12_msa(uint8_t *src, int32_t src_stride, 147cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 148cabdff1aSopenharmony_ci int32_t height) 149cabdff1aSopenharmony_ci{ 150cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 151cabdff1aSopenharmony_ci 152cabdff1aSopenharmony_ci LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 153cabdff1aSopenharmony_ci src += (8 * src_stride); 154cabdff1aSopenharmony_ci ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 155cabdff1aSopenharmony_ci dst += (8 * dst_stride); 156cabdff1aSopenharmony_ci LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 157cabdff1aSopenharmony_ci ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 158cabdff1aSopenharmony_ci} 
159cabdff1aSopenharmony_ci 160cabdff1aSopenharmony_cistatic void copy_width16_msa(uint8_t *src, int32_t src_stride, 161cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 162cabdff1aSopenharmony_ci int32_t height) 163cabdff1aSopenharmony_ci{ 164cabdff1aSopenharmony_ci int32_t cnt; 165cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 166cabdff1aSopenharmony_ci 167cabdff1aSopenharmony_ci if (12 == height) { 168cabdff1aSopenharmony_ci LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 169cabdff1aSopenharmony_ci src += (8 * src_stride); 170cabdff1aSopenharmony_ci ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 171cabdff1aSopenharmony_ci dst += (8 * dst_stride); 172cabdff1aSopenharmony_ci LD_UB4(src, src_stride, src0, src1, src2, src3); 173cabdff1aSopenharmony_ci src += (4 * src_stride); 174cabdff1aSopenharmony_ci ST_UB4(src0, src1, src2, src3, dst, dst_stride); 175cabdff1aSopenharmony_ci dst += (4 * dst_stride); 176cabdff1aSopenharmony_ci } else if (0 == (height % 8)) { 177cabdff1aSopenharmony_ci for (cnt = (height >> 3); cnt--;) { 178cabdff1aSopenharmony_ci LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, 179cabdff1aSopenharmony_ci src7); 180cabdff1aSopenharmony_ci src += (8 * src_stride); 181cabdff1aSopenharmony_ci ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 182cabdff1aSopenharmony_ci dst_stride); 183cabdff1aSopenharmony_ci dst += (8 * dst_stride); 184cabdff1aSopenharmony_ci } 185cabdff1aSopenharmony_ci } else if (0 == (height % 4)) { 186cabdff1aSopenharmony_ci for (cnt = (height >> 2); cnt--;) { 187cabdff1aSopenharmony_ci LD_UB4(src, src_stride, src0, src1, src2, src3); 188cabdff1aSopenharmony_ci src += (4 * src_stride); 189cabdff1aSopenharmony_ci 190cabdff1aSopenharmony_ci ST_UB4(src0, src1, src2, src3, dst, dst_stride); 191cabdff1aSopenharmony_ci dst += (4 * dst_stride); 192cabdff1aSopenharmony_ci } 193cabdff1aSopenharmony_ci } 
194cabdff1aSopenharmony_ci} 195cabdff1aSopenharmony_ci 196cabdff1aSopenharmony_cistatic void copy_width24_msa(uint8_t *src, int32_t src_stride, 197cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 198cabdff1aSopenharmony_ci int32_t height) 199cabdff1aSopenharmony_ci{ 200cabdff1aSopenharmony_ci int32_t cnt; 201cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 202cabdff1aSopenharmony_ci uint64_t out0, out1, out2, out3, out4, out5, out6, out7; 203cabdff1aSopenharmony_ci 204cabdff1aSopenharmony_ci for (cnt = 4; cnt--;) { 205cabdff1aSopenharmony_ci LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 206cabdff1aSopenharmony_ci LD4(src + 16, src_stride, out0, out1, out2, out3); 207cabdff1aSopenharmony_ci src += (4 * src_stride); 208cabdff1aSopenharmony_ci LD4(src + 16, src_stride, out4, out5, out6, out7); 209cabdff1aSopenharmony_ci src += (4 * src_stride); 210cabdff1aSopenharmony_ci 211cabdff1aSopenharmony_ci ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 212cabdff1aSopenharmony_ci SD4(out0, out1, out2, out3, dst + 16, dst_stride); 213cabdff1aSopenharmony_ci dst += (4 * dst_stride); 214cabdff1aSopenharmony_ci SD4(out4, out5, out6, out7, dst + 16, dst_stride); 215cabdff1aSopenharmony_ci dst += (4 * dst_stride); 216cabdff1aSopenharmony_ci } 217cabdff1aSopenharmony_ci} 218cabdff1aSopenharmony_ci 219cabdff1aSopenharmony_cistatic void copy_width32_msa(uint8_t *src, int32_t src_stride, 220cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 221cabdff1aSopenharmony_ci int32_t height) 222cabdff1aSopenharmony_ci{ 223cabdff1aSopenharmony_ci int32_t cnt; 224cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 225cabdff1aSopenharmony_ci 226cabdff1aSopenharmony_ci for (cnt = (height >> 2); cnt--;) { 227cabdff1aSopenharmony_ci LD_UB4(src, src_stride, src0, src1, src2, src3); 228cabdff1aSopenharmony_ci LD_UB4(src + 16, src_stride, src4, src5, src6, src7); 
229cabdff1aSopenharmony_ci src += (4 * src_stride); 230cabdff1aSopenharmony_ci ST_UB4(src0, src1, src2, src3, dst, dst_stride); 231cabdff1aSopenharmony_ci ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); 232cabdff1aSopenharmony_ci dst += (4 * dst_stride); 233cabdff1aSopenharmony_ci } 234cabdff1aSopenharmony_ci} 235cabdff1aSopenharmony_ci 236cabdff1aSopenharmony_cistatic void copy_width48_msa(uint8_t *src, int32_t src_stride, 237cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 238cabdff1aSopenharmony_ci int32_t height) 239cabdff1aSopenharmony_ci{ 240cabdff1aSopenharmony_ci int32_t cnt; 241cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 242cabdff1aSopenharmony_ci v16u8 src11; 243cabdff1aSopenharmony_ci 244cabdff1aSopenharmony_ci for (cnt = (height >> 2); cnt--;) { 245cabdff1aSopenharmony_ci LD_UB4(src, src_stride, src0, src1, src2, src3); 246cabdff1aSopenharmony_ci LD_UB4(src + 16, src_stride, src4, src5, src6, src7); 247cabdff1aSopenharmony_ci LD_UB4(src + 32, src_stride, src8, src9, src10, src11); 248cabdff1aSopenharmony_ci src += (4 * src_stride); 249cabdff1aSopenharmony_ci 250cabdff1aSopenharmony_ci ST_UB4(src0, src1, src2, src3, dst, dst_stride); 251cabdff1aSopenharmony_ci ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); 252cabdff1aSopenharmony_ci ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride); 253cabdff1aSopenharmony_ci dst += (4 * dst_stride); 254cabdff1aSopenharmony_ci } 255cabdff1aSopenharmony_ci} 256cabdff1aSopenharmony_ci 257cabdff1aSopenharmony_cistatic void copy_width64_msa(uint8_t *src, int32_t src_stride, 258cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 259cabdff1aSopenharmony_ci int32_t height) 260cabdff1aSopenharmony_ci{ 261cabdff1aSopenharmony_ci int32_t cnt; 262cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 263cabdff1aSopenharmony_ci v16u8 src8, src9, src10, src11, src12, src13, src14, src15; 264cabdff1aSopenharmony_ci 
265cabdff1aSopenharmony_ci for (cnt = (height >> 2); cnt--;) { 266cabdff1aSopenharmony_ci LD_UB4(src, 16, src0, src1, src2, src3); 267cabdff1aSopenharmony_ci src += src_stride; 268cabdff1aSopenharmony_ci LD_UB4(src, 16, src4, src5, src6, src7); 269cabdff1aSopenharmony_ci src += src_stride; 270cabdff1aSopenharmony_ci LD_UB4(src, 16, src8, src9, src10, src11); 271cabdff1aSopenharmony_ci src += src_stride; 272cabdff1aSopenharmony_ci LD_UB4(src, 16, src12, src13, src14, src15); 273cabdff1aSopenharmony_ci src += src_stride; 274cabdff1aSopenharmony_ci 275cabdff1aSopenharmony_ci ST_UB4(src0, src1, src2, src3, dst, 16); 276cabdff1aSopenharmony_ci dst += dst_stride; 277cabdff1aSopenharmony_ci ST_UB4(src4, src5, src6, src7, dst, 16); 278cabdff1aSopenharmony_ci dst += dst_stride; 279cabdff1aSopenharmony_ci ST_UB4(src8, src9, src10, src11, dst, 16); 280cabdff1aSopenharmony_ci dst += dst_stride; 281cabdff1aSopenharmony_ci ST_UB4(src12, src13, src14, src15, dst, 16); 282cabdff1aSopenharmony_ci dst += dst_stride; 283cabdff1aSopenharmony_ci } 284cabdff1aSopenharmony_ci} 285cabdff1aSopenharmony_ci 286cabdff1aSopenharmony_cistatic void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride, 287cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 288cabdff1aSopenharmony_ci const int8_t *filter) 289cabdff1aSopenharmony_ci{ 290cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, mask3, out; 291cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 292cabdff1aSopenharmony_ci v8i16 filt, out0, out1; 293cabdff1aSopenharmony_ci 294cabdff1aSopenharmony_ci mask0 = LD_UB(&ff_hevc_mask_arr[16]); 295cabdff1aSopenharmony_ci src -= 3; 296cabdff1aSopenharmony_ci 297cabdff1aSopenharmony_ci /* rearranging filter */ 298cabdff1aSopenharmony_ci filt = LD_SH(filter); 299cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 300cabdff1aSopenharmony_ci 301cabdff1aSopenharmony_ci mask1 = mask0 + 2; 302cabdff1aSopenharmony_ci mask2 = mask0 + 4; 
303cabdff1aSopenharmony_ci mask3 = mask0 + 6; 304cabdff1aSopenharmony_ci 305cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 306cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 307cabdff1aSopenharmony_ci HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 308cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out0, out1); 309cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, 6); 310cabdff1aSopenharmony_ci SAT_SH2_SH(out0, out1, 7); 311cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 312cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 313cabdff1aSopenharmony_ci} 314cabdff1aSopenharmony_ci 315cabdff1aSopenharmony_cistatic void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride, 316cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 317cabdff1aSopenharmony_ci const int8_t *filter) 318cabdff1aSopenharmony_ci{ 319cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, filt3; 320cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 321cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, mask3, out; 322cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 323cabdff1aSopenharmony_ci 324cabdff1aSopenharmony_ci mask0 = LD_UB(&ff_hevc_mask_arr[16]); 325cabdff1aSopenharmony_ci src -= 3; 326cabdff1aSopenharmony_ci 327cabdff1aSopenharmony_ci /* rearranging filter */ 328cabdff1aSopenharmony_ci filt = LD_SH(filter); 329cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 330cabdff1aSopenharmony_ci 331cabdff1aSopenharmony_ci mask1 = mask0 + 2; 332cabdff1aSopenharmony_ci mask2 = mask0 + 4; 333cabdff1aSopenharmony_ci mask3 = mask0 + 6; 334cabdff1aSopenharmony_ci 335cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 336cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 337cabdff1aSopenharmony_ci src += (4 * src_stride); 338cabdff1aSopenharmony_ci HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 
339cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out0, out1); 340cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 341cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 342cabdff1aSopenharmony_ci HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 343cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out2, out3); 344cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 345cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 346cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 347cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 348cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 349cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 350cabdff1aSopenharmony_ci} 351cabdff1aSopenharmony_ci 352cabdff1aSopenharmony_cistatic void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride, 353cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 354cabdff1aSopenharmony_ci const int8_t *filter) 355cabdff1aSopenharmony_ci{ 356cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, mask3, out; 357cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 358cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 359cabdff1aSopenharmony_ci 360cabdff1aSopenharmony_ci mask0 = LD_UB(&ff_hevc_mask_arr[16]); 361cabdff1aSopenharmony_ci src -= 3; 362cabdff1aSopenharmony_ci 363cabdff1aSopenharmony_ci /* rearranging filter */ 364cabdff1aSopenharmony_ci filt = LD_SH(filter); 365cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 366cabdff1aSopenharmony_ci 367cabdff1aSopenharmony_ci mask1 = mask0 + 2; 368cabdff1aSopenharmony_ci mask2 = mask0 + 4; 369cabdff1aSopenharmony_ci mask3 = mask0 + 6; 370cabdff1aSopenharmony_ci 371cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 372cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 
373cabdff1aSopenharmony_ci src += (4 * src_stride); 374cabdff1aSopenharmony_ci HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 375cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out0, out1); 376cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 377cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 378cabdff1aSopenharmony_ci src += (4 * src_stride); 379cabdff1aSopenharmony_ci HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 380cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out2, out3); 381cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 382cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 383cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 384cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 385cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 386cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 387cabdff1aSopenharmony_ci dst += (8 * dst_stride); 388cabdff1aSopenharmony_ci 389cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 390cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 391cabdff1aSopenharmony_ci src += (4 * src_stride); 392cabdff1aSopenharmony_ci HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 393cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out0, out1); 394cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 395cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 396cabdff1aSopenharmony_ci src += (4 * src_stride); 397cabdff1aSopenharmony_ci HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 398cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out2, out3); 399cabdff1aSopenharmony_ci 400cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 401cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 
402cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 403cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 404cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 405cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 406cabdff1aSopenharmony_ci} 407cabdff1aSopenharmony_ci 408cabdff1aSopenharmony_cistatic void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride, 409cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 410cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 411cabdff1aSopenharmony_ci{ 412cabdff1aSopenharmony_ci if (4 == height) { 413cabdff1aSopenharmony_ci common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter); 414cabdff1aSopenharmony_ci } else if (8 == height) { 415cabdff1aSopenharmony_ci common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter); 416cabdff1aSopenharmony_ci } else if (16 == height) { 417cabdff1aSopenharmony_ci common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter); 418cabdff1aSopenharmony_ci } 419cabdff1aSopenharmony_ci} 420cabdff1aSopenharmony_ci 421cabdff1aSopenharmony_cistatic void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride, 422cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 423cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 424cabdff1aSopenharmony_ci{ 425cabdff1aSopenharmony_ci uint32_t loop_cnt; 426cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 427cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; 428cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; 429cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 430cabdff1aSopenharmony_ci 431cabdff1aSopenharmony_ci mask0 = LD_UB(&ff_hevc_mask_arr[0]); 432cabdff1aSopenharmony_ci src -= 3; 433cabdff1aSopenharmony_ci 434cabdff1aSopenharmony_ci /* rearranging filter */ 435cabdff1aSopenharmony_ci filt = LD_SH(filter); 436cabdff1aSopenharmony_ci 
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 437cabdff1aSopenharmony_ci 438cabdff1aSopenharmony_ci mask1 = mask0 + 2; 439cabdff1aSopenharmony_ci mask2 = mask0 + 4; 440cabdff1aSopenharmony_ci mask3 = mask0 + 6; 441cabdff1aSopenharmony_ci 442cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 443cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 444cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 445cabdff1aSopenharmony_ci src += (4 * src_stride); 446cabdff1aSopenharmony_ci 447cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); 448cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); 449cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, 450cabdff1aSopenharmony_ci out0, out1, out2, out3); 451cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); 452cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); 453cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, 454cabdff1aSopenharmony_ci out0, out1, out2, out3); 455cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); 456cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); 457cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, 458cabdff1aSopenharmony_ci out0, out1, out2, out3); 459cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); 460cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); 461cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, 462cabdff1aSopenharmony_ci out0, out1, out2, out3); 463cabdff1aSopenharmony_ci 464cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 
465cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 466cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 467cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2, out3); 468cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 469cabdff1aSopenharmony_ci dst += (4 * dst_stride); 470cabdff1aSopenharmony_ci } 471cabdff1aSopenharmony_ci} 472cabdff1aSopenharmony_ci 473cabdff1aSopenharmony_cistatic void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, 474cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 475cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 476cabdff1aSopenharmony_ci{ 477cabdff1aSopenharmony_ci uint32_t loop_cnt; 478cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00; 479cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2; 480cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 481cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 482cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, filt3; 483cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3, out4, out5; 484cabdff1aSopenharmony_ci 485cabdff1aSopenharmony_ci mask00 = LD_UB(&ff_hevc_mask_arr[0]); 486cabdff1aSopenharmony_ci mask0 = LD_UB(&ff_hevc_mask_arr[16]); 487cabdff1aSopenharmony_ci 488cabdff1aSopenharmony_ci src = src - 3; 489cabdff1aSopenharmony_ci 490cabdff1aSopenharmony_ci /* rearranging filter */ 491cabdff1aSopenharmony_ci filt = LD_SH(filter); 492cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 493cabdff1aSopenharmony_ci 494cabdff1aSopenharmony_ci mask1 = mask00 + 2; 495cabdff1aSopenharmony_ci mask2 = mask00 + 4; 496cabdff1aSopenharmony_ci mask3 = mask00 + 6; 497cabdff1aSopenharmony_ci mask4 = mask0 + 2; 498cabdff1aSopenharmony_ci mask5 = mask0 + 4; 499cabdff1aSopenharmony_ci mask6 = mask0 + 6; 500cabdff1aSopenharmony_ci 501cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 502cabdff1aSopenharmony_ci 
/* 8 width */ 503cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 504cabdff1aSopenharmony_ci /* 4 width */ 505cabdff1aSopenharmony_ci LD_SB4(src + 8, src_stride, src4, src5, src6, src7); 506cabdff1aSopenharmony_ci 507cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 508cabdff1aSopenharmony_ci XORI_B4_128_SB(src4, src5, src6, src7); 509cabdff1aSopenharmony_ci src += (4 * src_stride); 510cabdff1aSopenharmony_ci 511cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1); 512cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3); 513cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, 514cabdff1aSopenharmony_ci out1, out2, out3); 515cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1); 516cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3); 517cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0, 518cabdff1aSopenharmony_ci out1, out2, out3); 519cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 520cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 521cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0, 522cabdff1aSopenharmony_ci out1, out2, out3); 523cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5); 524cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7); 525cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0, 526cabdff1aSopenharmony_ci out1, out2, out3); 527cabdff1aSopenharmony_ci 528cabdff1aSopenharmony_ci /* 4 width */ 529cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1); 530cabdff1aSopenharmony_ci DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5); 
531cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3); 532cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5); 533cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5); 534cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5); 535cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7); 536cabdff1aSopenharmony_ci DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5); 537cabdff1aSopenharmony_ci 538cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 539cabdff1aSopenharmony_ci SRARI_H2_SH(out4, out5, 6); 540cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 541cabdff1aSopenharmony_ci SAT_SH2_SH(out4, out5, 7); 542cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 543cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2, out3); 544cabdff1aSopenharmony_ci tmp2 = PCKEV_XORI128_UB(out4, out5); 545cabdff1aSopenharmony_ci 546cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 547cabdff1aSopenharmony_ci ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride); 548cabdff1aSopenharmony_ci dst += (4 * dst_stride); 549cabdff1aSopenharmony_ci } 550cabdff1aSopenharmony_ci} 551cabdff1aSopenharmony_ci 552cabdff1aSopenharmony_cistatic void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, 553cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 554cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 555cabdff1aSopenharmony_ci{ 556cabdff1aSopenharmony_ci uint32_t loop_cnt; 557cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, mask3, out; 558cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 559cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, filt3; 560cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 561cabdff1aSopenharmony_ci 562cabdff1aSopenharmony_ci mask0 = LD_UB(&ff_hevc_mask_arr[0]); 563cabdff1aSopenharmony_ci src -= 3; 564cabdff1aSopenharmony_ci 
565cabdff1aSopenharmony_ci /* rearranging filter */ 566cabdff1aSopenharmony_ci filt = LD_SH(filter); 567cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 568cabdff1aSopenharmony_ci 569cabdff1aSopenharmony_ci mask1 = mask0 + 2; 570cabdff1aSopenharmony_ci mask2 = mask0 + 4; 571cabdff1aSopenharmony_ci mask3 = mask0 + 6; 572cabdff1aSopenharmony_ci 573cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 574cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src2); 575cabdff1aSopenharmony_ci LD_SB2(src + 8, src_stride, src1, src3); 576cabdff1aSopenharmony_ci src += (2 * src_stride); 577cabdff1aSopenharmony_ci 578cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src4, src6); 579cabdff1aSopenharmony_ci LD_SB2(src + 8, src_stride, src5, src7); 580cabdff1aSopenharmony_ci src += (2 * src_stride); 581cabdff1aSopenharmony_ci 582cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 583cabdff1aSopenharmony_ci XORI_B4_128_SB(src4, src5, src6, src7); 584cabdff1aSopenharmony_ci HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 585cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out0, 586cabdff1aSopenharmony_ci out1, out2, out3); 587cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 588cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 589cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 590cabdff1aSopenharmony_ci ST_UB(out, dst); 591cabdff1aSopenharmony_ci dst += dst_stride; 592cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 593cabdff1aSopenharmony_ci ST_UB(out, dst); 594cabdff1aSopenharmony_ci dst += dst_stride; 595cabdff1aSopenharmony_ci 596cabdff1aSopenharmony_ci HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, 597cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out0, 598cabdff1aSopenharmony_ci out1, out2, out3); 599cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 600cabdff1aSopenharmony_ci 
SAT_SH4_SH(out0, out1, out2, out3, 7); 601cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 602cabdff1aSopenharmony_ci ST_UB(out, dst); 603cabdff1aSopenharmony_ci dst += dst_stride; 604cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 605cabdff1aSopenharmony_ci ST_UB(out, dst); 606cabdff1aSopenharmony_ci dst += dst_stride; 607cabdff1aSopenharmony_ci } 608cabdff1aSopenharmony_ci} 609cabdff1aSopenharmony_ci 610cabdff1aSopenharmony_cistatic void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, 611cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 612cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 613cabdff1aSopenharmony_ci{ 614cabdff1aSopenharmony_ci uint32_t loop_cnt; 615cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 616cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out; 617cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 618cabdff1aSopenharmony_ci v16i8 vec11; 619cabdff1aSopenharmony_ci v8i16 out0, out1, out2, out3, out8, out9, filt; 620cabdff1aSopenharmony_ci 621cabdff1aSopenharmony_ci mask0 = LD_UB(&ff_hevc_mask_arr[0]); 622cabdff1aSopenharmony_ci src -= 3; 623cabdff1aSopenharmony_ci 624cabdff1aSopenharmony_ci /* rearranging filter */ 625cabdff1aSopenharmony_ci filt = LD_SH(filter); 626cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 627cabdff1aSopenharmony_ci 628cabdff1aSopenharmony_ci mask1 = mask0 + 2; 629cabdff1aSopenharmony_ci mask2 = mask0 + 4; 630cabdff1aSopenharmony_ci mask3 = mask0 + 6; 631cabdff1aSopenharmony_ci mask4 = mask0 + 8; 632cabdff1aSopenharmony_ci mask5 = mask0 + 10; 633cabdff1aSopenharmony_ci mask6 = mask0 + 12; 634cabdff1aSopenharmony_ci mask7 = mask0 + 14; 635cabdff1aSopenharmony_ci 636cabdff1aSopenharmony_ci for (loop_cnt = 16; loop_cnt--;) { 637cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src2); 638cabdff1aSopenharmony_ci 
LD_SB2(src + 16, src_stride, src1, src3); 639cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 640cabdff1aSopenharmony_ci src += (2 * src_stride); 641cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8); 642cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9); 643cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3); 644cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0, 645cabdff1aSopenharmony_ci out8, out2, out9); 646cabdff1aSopenharmony_ci DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3); 647cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8); 648cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9); 649cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3); 650cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, 651cabdff1aSopenharmony_ci out0, out8, out2, out9); 652cabdff1aSopenharmony_ci DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3); 653cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10); 654cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11); 655cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7); 656cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1, 657cabdff1aSopenharmony_ci out0, out8, out2, out9); 658cabdff1aSopenharmony_ci DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3); 659cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10); 660cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11); 661cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7); 662cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3, 
663cabdff1aSopenharmony_ci out0, out8, out2, out9); 664cabdff1aSopenharmony_ci DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3); 665cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out8, out2, out9, 6); 666cabdff1aSopenharmony_ci SRARI_H2_SH(out1, out3, 6); 667cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out8, out2, out9, 7); 668cabdff1aSopenharmony_ci SAT_SH2_SH(out1, out3, 7); 669cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out8, out9); 670cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst + 16, dst_stride); 671cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 672cabdff1aSopenharmony_ci ST_UB(out, dst); 673cabdff1aSopenharmony_ci dst += dst_stride; 674cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 675cabdff1aSopenharmony_ci ST_UB(out, dst); 676cabdff1aSopenharmony_ci dst += dst_stride; 677cabdff1aSopenharmony_ci } 678cabdff1aSopenharmony_ci} 679cabdff1aSopenharmony_ci 680cabdff1aSopenharmony_cistatic void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, 681cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 682cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 683cabdff1aSopenharmony_ci{ 684cabdff1aSopenharmony_ci uint32_t loop_cnt; 685cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, mask3, out; 686cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 687cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, filt3; 688cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 689cabdff1aSopenharmony_ci 690cabdff1aSopenharmony_ci mask0 = LD_UB(&ff_hevc_mask_arr[0]); 691cabdff1aSopenharmony_ci src -= 3; 692cabdff1aSopenharmony_ci 693cabdff1aSopenharmony_ci /* rearranging filter */ 694cabdff1aSopenharmony_ci filt = LD_SH(filter); 695cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 696cabdff1aSopenharmony_ci 697cabdff1aSopenharmony_ci mask1 = mask0 + 2; 698cabdff1aSopenharmony_ci mask2 = mask0 + 4; 699cabdff1aSopenharmony_ci mask3 = mask0 + 6; 700cabdff1aSopenharmony_ci 
701cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 702cabdff1aSopenharmony_ci src0 = LD_SB(src); 703cabdff1aSopenharmony_ci src1 = LD_SB(src + 8); 704cabdff1aSopenharmony_ci src2 = LD_SB(src + 16); 705cabdff1aSopenharmony_ci src3 = LD_SB(src + 24); 706cabdff1aSopenharmony_ci src += src_stride; 707cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 708cabdff1aSopenharmony_ci 709cabdff1aSopenharmony_ci src4 = LD_SB(src); 710cabdff1aSopenharmony_ci src5 = LD_SB(src + 8); 711cabdff1aSopenharmony_ci src6 = LD_SB(src + 16); 712cabdff1aSopenharmony_ci src7 = LD_SB(src + 24); 713cabdff1aSopenharmony_ci src += src_stride; 714cabdff1aSopenharmony_ci XORI_B4_128_SB(src4, src5, src6, src7); 715cabdff1aSopenharmony_ci 716cabdff1aSopenharmony_ci HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 717cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out0, 718cabdff1aSopenharmony_ci out1, out2, out3); 719cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 720cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 721cabdff1aSopenharmony_ci 722cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 723cabdff1aSopenharmony_ci ST_UB(out, dst); 724cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 725cabdff1aSopenharmony_ci ST_UB(out, dst + 16); 726cabdff1aSopenharmony_ci dst += dst_stride; 727cabdff1aSopenharmony_ci 728cabdff1aSopenharmony_ci HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, 729cabdff1aSopenharmony_ci mask3, filt0, filt1, filt2, filt3, out0, 730cabdff1aSopenharmony_ci out1, out2, out3); 731cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 732cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 733cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 734cabdff1aSopenharmony_ci ST_UB(out, dst); 735cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 736cabdff1aSopenharmony_ci ST_UB(out, dst + 16); 
737cabdff1aSopenharmony_ci dst += dst_stride; 738cabdff1aSopenharmony_ci } 739cabdff1aSopenharmony_ci} 740cabdff1aSopenharmony_ci 741cabdff1aSopenharmony_cistatic void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, 742cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 743cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 744cabdff1aSopenharmony_ci{ 745cabdff1aSopenharmony_ci uint32_t loop_cnt; 746cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2; 747cabdff1aSopenharmony_ci v16i8 src4; 748cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out; 749cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 750cabdff1aSopenharmony_ci 751cabdff1aSopenharmony_ci mask0 = LD_UB(&ff_hevc_mask_arr[0]); 752cabdff1aSopenharmony_ci src -= 3; 753cabdff1aSopenharmony_ci 754cabdff1aSopenharmony_ci /* rearranging filter */ 755cabdff1aSopenharmony_ci filt = LD_SH(filter); 756cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 757cabdff1aSopenharmony_ci 758cabdff1aSopenharmony_ci mask1 = mask0 + 2; 759cabdff1aSopenharmony_ci mask2 = mask0 + 4; 760cabdff1aSopenharmony_ci mask3 = mask0 + 6; 761cabdff1aSopenharmony_ci mask4 = mask0 + 8; 762cabdff1aSopenharmony_ci mask5 = mask0 + 10; 763cabdff1aSopenharmony_ci mask6 = mask0 + 12; 764cabdff1aSopenharmony_ci mask7 = mask0 + 14; 765cabdff1aSopenharmony_ci 766cabdff1aSopenharmony_ci for (loop_cnt = 64; loop_cnt--;) { 767cabdff1aSopenharmony_ci src0 = LD_SB(src); 768cabdff1aSopenharmony_ci src1 = LD_SB(src + 8); 769cabdff1aSopenharmony_ci src2 = LD_SB(src + 16); 770cabdff1aSopenharmony_ci src3 = LD_SB(src + 32); 771cabdff1aSopenharmony_ci src4 = LD_SB(src + 40); 772cabdff1aSopenharmony_ci src += src_stride; 773cabdff1aSopenharmony_ci 774cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 775cabdff1aSopenharmony_ci src4 = (v16i8) __msa_xori_b((v16u8) src4, 128); 776cabdff1aSopenharmony_ci 
777cabdff1aSopenharmony_ci VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0, 778cabdff1aSopenharmony_ci vec0, vec1, vec2); 779cabdff1aSopenharmony_ci DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2); 780cabdff1aSopenharmony_ci VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1, 781cabdff1aSopenharmony_ci vec0, vec1, vec2); 782cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1); 783cabdff1aSopenharmony_ci out2 = __msa_dpadd_s_h(out2, vec2, filt1); 784cabdff1aSopenharmony_ci VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2, 785cabdff1aSopenharmony_ci vec0, vec1, vec2); 786cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1); 787cabdff1aSopenharmony_ci out2 = __msa_dpadd_s_h(out2, vec2, filt2); 788cabdff1aSopenharmony_ci 789cabdff1aSopenharmony_ci VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3, 790cabdff1aSopenharmony_ci vec0, vec1, vec2); 791cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1); 792cabdff1aSopenharmony_ci out2 = __msa_dpadd_s_h(out2, vec2, filt3); 793cabdff1aSopenharmony_ci 794cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, 6); 795cabdff1aSopenharmony_ci out3 = __msa_srari_h(out2, 6); 796cabdff1aSopenharmony_ci SAT_SH3_SH(out0, out1, out3, 7); 797cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 798cabdff1aSopenharmony_ci ST_UB(out, dst); 799cabdff1aSopenharmony_ci 800cabdff1aSopenharmony_ci VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0, 801cabdff1aSopenharmony_ci vec0, vec1, vec2); 802cabdff1aSopenharmony_ci DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2); 803cabdff1aSopenharmony_ci VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1, 804cabdff1aSopenharmony_ci vec0, vec1, vec2); 805cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1); 806cabdff1aSopenharmony_ci out2 = __msa_dpadd_s_h(out2, vec2, filt1); 
807cabdff1aSopenharmony_ci VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2, 808cabdff1aSopenharmony_ci vec0, vec1, vec2); 809cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1); 810cabdff1aSopenharmony_ci out2 = __msa_dpadd_s_h(out2, vec2, filt2); 811cabdff1aSopenharmony_ci VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3, 812cabdff1aSopenharmony_ci vec0, vec1, vec2); 813cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1); 814cabdff1aSopenharmony_ci out2 = __msa_dpadd_s_h(out2, vec2, filt3); 815cabdff1aSopenharmony_ci 816cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, 6); 817cabdff1aSopenharmony_ci out2 = __msa_srari_h(out2, 6); 818cabdff1aSopenharmony_ci SAT_SH3_SH(out0, out1, out2, 7); 819cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out3, out0); 820cabdff1aSopenharmony_ci ST_UB(out, dst + 16); 821cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out1, out2); 822cabdff1aSopenharmony_ci ST_UB(out, dst + 32); 823cabdff1aSopenharmony_ci dst += dst_stride; 824cabdff1aSopenharmony_ci } 825cabdff1aSopenharmony_ci} 826cabdff1aSopenharmony_ci 827cabdff1aSopenharmony_cistatic void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, 828cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 829cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 830cabdff1aSopenharmony_ci{ 831cabdff1aSopenharmony_ci int32_t loop_cnt; 832cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, mask3, out; 833cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 834cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 835cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, filt3; 836cabdff1aSopenharmony_ci v8i16 res0, res1, res2, res3, filt; 837cabdff1aSopenharmony_ci 838cabdff1aSopenharmony_ci mask0 = LD_UB(&ff_hevc_mask_arr[0]); 839cabdff1aSopenharmony_ci src -= 3; 840cabdff1aSopenharmony_ci 841cabdff1aSopenharmony_ci /* rearranging filter */ 
842cabdff1aSopenharmony_ci filt = LD_SH(filter); 843cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 844cabdff1aSopenharmony_ci 845cabdff1aSopenharmony_ci mask1 = mask0 + 2; 846cabdff1aSopenharmony_ci mask2 = mask0 + 4; 847cabdff1aSopenharmony_ci mask3 = mask0 + 6; 848cabdff1aSopenharmony_ci 849cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 850cabdff1aSopenharmony_ci LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7); 851cabdff1aSopenharmony_ci src += src_stride; 852cabdff1aSopenharmony_ci 853cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 854cabdff1aSopenharmony_ci 855cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 856cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 857cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, 858cabdff1aSopenharmony_ci res1, res2, res3); 859cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1); 860cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3); 861cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0, 862cabdff1aSopenharmony_ci res1, res2, res3); 863cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 864cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 865cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0, 866cabdff1aSopenharmony_ci res1, res2, res3); 867cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5); 868cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7); 869cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0, 870cabdff1aSopenharmony_ci res1, res2, res3); 871cabdff1aSopenharmony_ci 
872cabdff1aSopenharmony_ci SRARI_H4_SH(res0, res1, res2, res3, 6); 873cabdff1aSopenharmony_ci SAT_SH4_SH(res0, res1, res2, res3, 7); 874cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(res0, res1); 875cabdff1aSopenharmony_ci ST_UB(out, dst); 876cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(res2, res3); 877cabdff1aSopenharmony_ci ST_UB(out, dst + 16); 878cabdff1aSopenharmony_ci 879cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 880cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3); 881cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, 882cabdff1aSopenharmony_ci res1, res2, res3); 883cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1); 884cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3); 885cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0, 886cabdff1aSopenharmony_ci res1, res2, res3); 887cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5); 888cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7); 889cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0, 890cabdff1aSopenharmony_ci res1, res2, res3); 891cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5); 892cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7); 893cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0, 894cabdff1aSopenharmony_ci res1, res2, res3); 895cabdff1aSopenharmony_ci 896cabdff1aSopenharmony_ci SRARI_H4_SH(res0, res1, res2, res3, 6); 897cabdff1aSopenharmony_ci SAT_SH4_SH(res0, res1, res2, res3, 7); 898cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(res0, res1); 899cabdff1aSopenharmony_ci ST_UB(out, dst + 32); 900cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(res2, 
res3); 901cabdff1aSopenharmony_ci ST_UB(out, dst + 48); 902cabdff1aSopenharmony_ci dst += dst_stride; 903cabdff1aSopenharmony_ci } 904cabdff1aSopenharmony_ci} 905cabdff1aSopenharmony_ci 906cabdff1aSopenharmony_cistatic void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, 907cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 908cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 909cabdff1aSopenharmony_ci{ 910cabdff1aSopenharmony_ci uint32_t loop_cnt; 911cabdff1aSopenharmony_ci v16u8 out0, out1; 912cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 913cabdff1aSopenharmony_ci v16i8 src11, src12, src13, src14; 914cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 915cabdff1aSopenharmony_ci v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; 916cabdff1aSopenharmony_ci v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312; 917cabdff1aSopenharmony_ci v16i8 src10998, filt0, filt1, filt2, filt3; 918cabdff1aSopenharmony_ci v8i16 filt, out10, out32, out54, out76; 919cabdff1aSopenharmony_ci 920cabdff1aSopenharmony_ci src -= (3 * src_stride); 921cabdff1aSopenharmony_ci 922cabdff1aSopenharmony_ci filt = LD_SH(filter); 923cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 924cabdff1aSopenharmony_ci 925cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 926cabdff1aSopenharmony_ci src += (7 * src_stride); 927cabdff1aSopenharmony_ci 928cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, 929cabdff1aSopenharmony_ci src54_r, src21_r); 930cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 931cabdff1aSopenharmony_ci ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, 932cabdff1aSopenharmony_ci src4332, src6554); 933cabdff1aSopenharmony_ci XORI_B3_128_SB(src2110, src4332, 
src6554); 934cabdff1aSopenharmony_ci 935cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 936cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 937cabdff1aSopenharmony_ci src += (4 * src_stride); 938cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src11, src12, src13, src14); 939cabdff1aSopenharmony_ci src += (4 * src_stride); 940cabdff1aSopenharmony_ci 941cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, 942cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 943cabdff1aSopenharmony_ci ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, 944cabdff1aSopenharmony_ci src1110_r, src1211_r, src1312_r, src1413_r); 945cabdff1aSopenharmony_ci ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); 946cabdff1aSopenharmony_ci ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r, 947cabdff1aSopenharmony_ci src12111110, src14131312); 948cabdff1aSopenharmony_ci XORI_B2_128_SB(src8776, src10998); 949cabdff1aSopenharmony_ci XORI_B2_128_SB(src12111110, src14131312); 950cabdff1aSopenharmony_ci 951cabdff1aSopenharmony_ci DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32); 952cabdff1aSopenharmony_ci DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76); 953cabdff1aSopenharmony_ci DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32); 954cabdff1aSopenharmony_ci DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76); 955cabdff1aSopenharmony_ci DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32); 956cabdff1aSopenharmony_ci DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76); 957cabdff1aSopenharmony_ci DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32); 958cabdff1aSopenharmony_ci DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76); 959cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out32, 6); 960cabdff1aSopenharmony_ci SRARI_H2_SH(out54, out76, 6); 961cabdff1aSopenharmony_ci SAT_SH2_SH(out10, out32, 7); 
962cabdff1aSopenharmony_ci SAT_SH2_SH(out54, out76, 7); 963cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(out10, out32); 964cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(out54, out76); 965cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 966cabdff1aSopenharmony_ci dst += (8 * dst_stride); 967cabdff1aSopenharmony_ci 968cabdff1aSopenharmony_ci src2110 = src10998; 969cabdff1aSopenharmony_ci src4332 = src12111110; 970cabdff1aSopenharmony_ci src6554 = src14131312; 971cabdff1aSopenharmony_ci src6 = src14; 972cabdff1aSopenharmony_ci } 973cabdff1aSopenharmony_ci} 974cabdff1aSopenharmony_ci 975cabdff1aSopenharmony_cistatic void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, 976cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 977cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 978cabdff1aSopenharmony_ci{ 979cabdff1aSopenharmony_ci uint32_t loop_cnt; 980cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 981cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 982cabdff1aSopenharmony_ci v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; 983cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 984cabdff1aSopenharmony_ci v8i16 filt, out0_r, out1_r, out2_r, out3_r; 985cabdff1aSopenharmony_ci 986cabdff1aSopenharmony_ci src -= (3 * src_stride); 987cabdff1aSopenharmony_ci 988cabdff1aSopenharmony_ci filt = LD_SH(filter); 989cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 990cabdff1aSopenharmony_ci 991cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 992cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 993cabdff1aSopenharmony_ci src += (7 * src_stride); 994cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, 995cabdff1aSopenharmony_ci src54_r, src21_r); 996cabdff1aSopenharmony_ci 
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 997cabdff1aSopenharmony_ci 998cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 999cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 1000cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1001cabdff1aSopenharmony_ci src += (4 * src_stride); 1002cabdff1aSopenharmony_ci 1003cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, 1004cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 1005cabdff1aSopenharmony_ci DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0, 1006cabdff1aSopenharmony_ci filt0, out0_r, out1_r, out2_r, out3_r); 1007cabdff1aSopenharmony_ci DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1, 1008cabdff1aSopenharmony_ci filt1, out0_r, out1_r, out2_r, out3_r); 1009cabdff1aSopenharmony_ci DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2, 1010cabdff1aSopenharmony_ci filt2, out0_r, out1_r, out2_r, out3_r); 1011cabdff1aSopenharmony_ci DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3, 1012cabdff1aSopenharmony_ci filt3, out0_r, out1_r, out2_r, out3_r); 1013cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); 1014cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1015cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); 1016cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); 1017cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 1018cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1019cabdff1aSopenharmony_ci 1020cabdff1aSopenharmony_ci src10_r = src54_r; 1021cabdff1aSopenharmony_ci src32_r = src76_r; 1022cabdff1aSopenharmony_ci src54_r = src98_r; 1023cabdff1aSopenharmony_ci src21_r = src65_r; 1024cabdff1aSopenharmony_ci src43_r = src87_r; 1025cabdff1aSopenharmony_ci src65_r = src109_r; 1026cabdff1aSopenharmony_ci src6 = src10; 1027cabdff1aSopenharmony_ci } 
1028cabdff1aSopenharmony_ci} 1029cabdff1aSopenharmony_ci 1030cabdff1aSopenharmony_cistatic void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, 1031cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1032cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1033cabdff1aSopenharmony_ci{ 1034cabdff1aSopenharmony_ci uint32_t loop_cnt; 1035cabdff1aSopenharmony_ci uint32_t out2, out3; 1036cabdff1aSopenharmony_ci uint64_t out0, out1; 1037cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 1038cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1039cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, filt3; 1040cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 1041cabdff1aSopenharmony_ci v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; 1042cabdff1aSopenharmony_ci v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; 1043cabdff1aSopenharmony_ci v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 1044cabdff1aSopenharmony_ci 1045cabdff1aSopenharmony_ci src -= (3 * src_stride); 1046cabdff1aSopenharmony_ci 1047cabdff1aSopenharmony_ci filt = LD_SH(filter); 1048cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1049cabdff1aSopenharmony_ci 1050cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1051cabdff1aSopenharmony_ci src += (7 * src_stride); 1052cabdff1aSopenharmony_ci 1053cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1054cabdff1aSopenharmony_ci 1055cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, 1056cabdff1aSopenharmony_ci src54_r, src21_r); 1057cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1058cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, 1059cabdff1aSopenharmony_ci 
src54_l, src21_l); 1060cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1061cabdff1aSopenharmony_ci 1062cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 1063cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 1064cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1065cabdff1aSopenharmony_ci src += (4 * src_stride); 1066cabdff1aSopenharmony_ci 1067cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, 1068cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 1069cabdff1aSopenharmony_ci ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, 1070cabdff1aSopenharmony_ci src87_l, src98_l, src109_l); 1071cabdff1aSopenharmony_ci out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0, 1072cabdff1aSopenharmony_ci filt1, filt2, filt3); 1073cabdff1aSopenharmony_ci out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0, 1074cabdff1aSopenharmony_ci filt1, filt2, filt3); 1075cabdff1aSopenharmony_ci out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0, 1076cabdff1aSopenharmony_ci filt1, filt2, filt3); 1077cabdff1aSopenharmony_ci out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0, 1078cabdff1aSopenharmony_ci filt1, filt2, filt3); 1079cabdff1aSopenharmony_ci out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0, 1080cabdff1aSopenharmony_ci filt1, filt2, filt3); 1081cabdff1aSopenharmony_ci out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0, 1082cabdff1aSopenharmony_ci filt1, filt2, filt3); 1083cabdff1aSopenharmony_ci out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0, 1084cabdff1aSopenharmony_ci filt1, filt2, filt3); 1085cabdff1aSopenharmony_ci out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0, 1086cabdff1aSopenharmony_ci filt1, filt2, filt3); 1087cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); 
1088cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6); 1089cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1090cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1091cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 1092cabdff1aSopenharmony_ci out3_r, tmp0, tmp1, tmp2, tmp3); 1093cabdff1aSopenharmony_ci XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); 1094cabdff1aSopenharmony_ci 1095cabdff1aSopenharmony_ci out0 = __msa_copy_u_d((v2i64) tmp0, 0); 1096cabdff1aSopenharmony_ci out1 = __msa_copy_u_d((v2i64) tmp1, 0); 1097cabdff1aSopenharmony_ci out2 = __msa_copy_u_w((v4i32) tmp0, 2); 1098cabdff1aSopenharmony_ci out3 = __msa_copy_u_w((v4i32) tmp1, 2); 1099cabdff1aSopenharmony_ci SD(out0, dst); 1100cabdff1aSopenharmony_ci SW(out2, (dst + 8)); 1101cabdff1aSopenharmony_ci dst += dst_stride; 1102cabdff1aSopenharmony_ci SD(out1, dst); 1103cabdff1aSopenharmony_ci SW(out3, (dst + 8)); 1104cabdff1aSopenharmony_ci dst += dst_stride; 1105cabdff1aSopenharmony_ci out0 = __msa_copy_u_d((v2i64) tmp2, 0); 1106cabdff1aSopenharmony_ci out1 = __msa_copy_u_d((v2i64) tmp3, 0); 1107cabdff1aSopenharmony_ci out2 = __msa_copy_u_w((v4i32) tmp2, 2); 1108cabdff1aSopenharmony_ci out3 = __msa_copy_u_w((v4i32) tmp3, 2); 1109cabdff1aSopenharmony_ci SD(out0, dst); 1110cabdff1aSopenharmony_ci SW(out2, (dst + 8)); 1111cabdff1aSopenharmony_ci dst += dst_stride; 1112cabdff1aSopenharmony_ci SD(out1, dst); 1113cabdff1aSopenharmony_ci SW(out3, (dst + 8)); 1114cabdff1aSopenharmony_ci dst += dst_stride; 1115cabdff1aSopenharmony_ci 1116cabdff1aSopenharmony_ci src10_r = src54_r; 1117cabdff1aSopenharmony_ci src32_r = src76_r; 1118cabdff1aSopenharmony_ci src54_r = src98_r; 1119cabdff1aSopenharmony_ci src21_r = src65_r; 1120cabdff1aSopenharmony_ci src43_r = src87_r; 1121cabdff1aSopenharmony_ci src65_r = src109_r; 1122cabdff1aSopenharmony_ci src10_l = src54_l; 1123cabdff1aSopenharmony_ci src32_l = src76_l; 
1124cabdff1aSopenharmony_ci src54_l = src98_l; 1125cabdff1aSopenharmony_ci src21_l = src65_l; 1126cabdff1aSopenharmony_ci src43_l = src87_l; 1127cabdff1aSopenharmony_ci src65_l = src109_l; 1128cabdff1aSopenharmony_ci src6 = src10; 1129cabdff1aSopenharmony_ci } 1130cabdff1aSopenharmony_ci} 1131cabdff1aSopenharmony_ci /* common_vt_8t_16w_msa: vertical 8-tap filter over a single 16-byte-wide column. src is stepped back 3 rows, 7 rows are preloaded, and every loop pass loads 4 further rows, filters the right and left interleaved halves with the four 16-bit splats taken from filter, applies SRARI by 6 and SAT with 7, and stores 4 rows with ST_UB4. The loop count is height >> 2, so a height that is not a multiple of 4 leaves the remaining rows unwritten. */ 1132cabdff1aSopenharmony_cistatic void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, 1133cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1134cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1135cabdff1aSopenharmony_ci{ 1136cabdff1aSopenharmony_ci uint32_t loop_cnt; 1137cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1138cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, filt3; 1139cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 1140cabdff1aSopenharmony_ci v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; 1141cabdff1aSopenharmony_ci v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; 1142cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 1143cabdff1aSopenharmony_ci v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 1144cabdff1aSopenharmony_ci 1145cabdff1aSopenharmony_ci src -= (3 * src_stride); /* begin 3 rows before the row passed in */ 1146cabdff1aSopenharmony_ci 1147cabdff1aSopenharmony_ci filt = LD_SH(filter); 1148cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1149cabdff1aSopenharmony_ci 1150cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1151cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1152cabdff1aSopenharmony_ci src += (7 * src_stride); 1153cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, 1154cabdff1aSopenharmony_ci src54_r, src21_r); 1155cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1156cabdff1aSopenharmony_ci
ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, 1157cabdff1aSopenharmony_ci src54_l, src21_l); 1158cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1159cabdff1aSopenharmony_ci 1160cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { /* 4 new source rows in, 4 filtered rows out per pass */ 1161cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 1162cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1163cabdff1aSopenharmony_ci src += (4 * src_stride); 1164cabdff1aSopenharmony_ci 1165cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, 1166cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 1167cabdff1aSopenharmony_ci ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, 1168cabdff1aSopenharmony_ci src87_l, src98_l, src109_l); 1169cabdff1aSopenharmony_ci out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0, 1170cabdff1aSopenharmony_ci filt1, filt2, filt3); 1171cabdff1aSopenharmony_ci out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0, 1172cabdff1aSopenharmony_ci filt1, filt2, filt3); 1173cabdff1aSopenharmony_ci out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0, 1174cabdff1aSopenharmony_ci filt1, filt2, filt3); 1175cabdff1aSopenharmony_ci out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0, 1176cabdff1aSopenharmony_ci filt1, filt2, filt3); 1177cabdff1aSopenharmony_ci out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0, 1178cabdff1aSopenharmony_ci filt1, filt2, filt3); 1179cabdff1aSopenharmony_ci out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0, 1180cabdff1aSopenharmony_ci filt1, filt2, filt3); 1181cabdff1aSopenharmony_ci out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0, 1182cabdff1aSopenharmony_ci filt1, filt2, filt3); 1183cabdff1aSopenharmony_ci out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1184cabdff1aSopenharmony_ci filt1, filt2, filt3); 1185cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); 1186cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6); 1187cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1188cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1189cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 1190cabdff1aSopenharmony_ci out3_r, tmp0, tmp1, tmp2, tmp3); 1191cabdff1aSopenharmony_ci XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); 1192cabdff1aSopenharmony_ci ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 1193cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1194cabdff1aSopenharmony_ci /* shift the interleaved-row history down by 4 rows for the next pass */ 1195cabdff1aSopenharmony_ci src10_r = src54_r; 1196cabdff1aSopenharmony_ci src32_r = src76_r; 1197cabdff1aSopenharmony_ci src54_r = src98_r; 1198cabdff1aSopenharmony_ci src21_r = src65_r; 1199cabdff1aSopenharmony_ci src43_r = src87_r; 1200cabdff1aSopenharmony_ci src65_r = src109_r; 1201cabdff1aSopenharmony_ci src10_l = src54_l; 1202cabdff1aSopenharmony_ci src32_l = src76_l; 1203cabdff1aSopenharmony_ci src54_l = src98_l; 1204cabdff1aSopenharmony_ci src21_l = src65_l; 1205cabdff1aSopenharmony_ci src43_l = src87_l; 1206cabdff1aSopenharmony_ci src65_l = src109_l; 1207cabdff1aSopenharmony_ci src6 = src10; 1208cabdff1aSopenharmony_ci } 1209cabdff1aSopenharmony_ci} 1210cabdff1aSopenharmony_ci /* common_vt_8t_16w_mult_msa: vertical 8-tap filter for a width processed as 16-column strips. The outer loop runs width >> 4 times; each strip works through src_tmp/dst_tmp, preloads 7 rows and then emits 4 rows per inner pass (height >> 2 passes). src is stepped back 3 rows once up front, and src and dst advance by 16 bytes per strip. */ 1211cabdff1aSopenharmony_cistatic void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride, 1212cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1213cabdff1aSopenharmony_ci const int8_t *filter, int32_t height, 1214cabdff1aSopenharmony_ci int32_t width) 1215cabdff1aSopenharmony_ci{ 1216cabdff1aSopenharmony_ci uint8_t *src_tmp; 1217cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1218cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 1219cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1220cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, filt3; 1221cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 1222cabdff1aSopenharmony_ci v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; 1223cabdff1aSopenharmony_ci v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; 1224cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 1225cabdff1aSopenharmony_ci v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 1226cabdff1aSopenharmony_ci 1227cabdff1aSopenharmony_ci src -= (3 * src_stride); 1228cabdff1aSopenharmony_ci 1229cabdff1aSopenharmony_ci filt = LD_SH(filter); 1230cabdff1aSopenharmony_ci SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1231cabdff1aSopenharmony_ci 1232cabdff1aSopenharmony_ci for (cnt = (width >> 4); cnt--;) { /* restart from the top of the current strip */ 1233cabdff1aSopenharmony_ci src_tmp = src; 1234cabdff1aSopenharmony_ci dst_tmp = dst; 1235cabdff1aSopenharmony_ci 1236cabdff1aSopenharmony_ci LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1237cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1238cabdff1aSopenharmony_ci src_tmp += (7 * src_stride); 1239cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, 1240cabdff1aSopenharmony_ci src32_r, src54_r, src21_r); 1241cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1242cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, 1243cabdff1aSopenharmony_ci src32_l, src54_l, src21_l); 1244cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1245cabdff1aSopenharmony_ci 1246cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1247cabdff1aSopenharmony_ci LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); 1248cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1249cabdff1aSopenharmony_ci src_tmp += (4 * src_stride);
1250cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, 1251cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 1252cabdff1aSopenharmony_ci ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, 1253cabdff1aSopenharmony_ci src87_l, src98_l, src109_l); 1254cabdff1aSopenharmony_ci out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, 1255cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1256cabdff1aSopenharmony_ci out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, 1257cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1258cabdff1aSopenharmony_ci out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, 1259cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1260cabdff1aSopenharmony_ci out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, 1261cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1262cabdff1aSopenharmony_ci out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, 1263cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1264cabdff1aSopenharmony_ci out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, 1265cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1266cabdff1aSopenharmony_ci out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, 1267cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1268cabdff1aSopenharmony_ci out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, 1269cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1270cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); 1271cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6); 1272cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1273cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1274cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 1275cabdff1aSopenharmony_ci out3_r, tmp0, tmp1, tmp2, tmp3); 1276cabdff1aSopenharmony_ci
XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); 1277cabdff1aSopenharmony_ci ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); 1278cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 1279cabdff1aSopenharmony_ci 1280cabdff1aSopenharmony_ci src10_r = src54_r; 1281cabdff1aSopenharmony_ci src32_r = src76_r; 1282cabdff1aSopenharmony_ci src54_r = src98_r; 1283cabdff1aSopenharmony_ci src21_r = src65_r; 1284cabdff1aSopenharmony_ci src43_r = src87_r; 1285cabdff1aSopenharmony_ci src65_r = src109_r; 1286cabdff1aSopenharmony_ci src10_l = src54_l; 1287cabdff1aSopenharmony_ci src32_l = src76_l; 1288cabdff1aSopenharmony_ci src54_l = src98_l; 1289cabdff1aSopenharmony_ci src21_l = src65_l; 1290cabdff1aSopenharmony_ci src43_l = src87_l; 1291cabdff1aSopenharmony_ci src65_l = src109_l; 1292cabdff1aSopenharmony_ci src6 = src10; 1293cabdff1aSopenharmony_ci } 1294cabdff1aSopenharmony_ci 1295cabdff1aSopenharmony_ci src += 16; 1296cabdff1aSopenharmony_ci dst += 16; 1297cabdff1aSopenharmony_ci } 1298cabdff1aSopenharmony_ci} 1299cabdff1aSopenharmony_ci /* common_vt_8t_24w_msa: handles 24 columns as one 16-column strip (common_vt_8t_16w_mult_msa with width 16) followed by common_vt_8t_8w_msa on the columns starting at byte offset 16 of src and dst. */ 1300cabdff1aSopenharmony_cistatic void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, 1301cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1302cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1303cabdff1aSopenharmony_ci{ 1304cabdff1aSopenharmony_ci common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, 1305cabdff1aSopenharmony_ci 16); 1306cabdff1aSopenharmony_ci 1307cabdff1aSopenharmony_ci common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter, 1308cabdff1aSopenharmony_ci height); 1309cabdff1aSopenharmony_ci} 1310cabdff1aSopenharmony_ci /* common_vt_8t_32w_msa: forwards to common_vt_8t_16w_mult_msa with width 32 (two 16-column strips). */ 1311cabdff1aSopenharmony_cistatic void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, 1312cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1313cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1314cabdff1aSopenharmony_ci{ 1315cabdff1aSopenharmony_ci common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter,
height, 1316cabdff1aSopenharmony_ci 32); 1317cabdff1aSopenharmony_ci} 1318cabdff1aSopenharmony_ci /* common_vt_8t_48w_msa: forwards to common_vt_8t_16w_mult_msa with width 48 (three 16-column strips). */ 1319cabdff1aSopenharmony_cistatic void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, 1320cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1321cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1322cabdff1aSopenharmony_ci{ 1323cabdff1aSopenharmony_ci common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, 1324cabdff1aSopenharmony_ci 48); 1325cabdff1aSopenharmony_ci} 1326cabdff1aSopenharmony_ci /* common_vt_8t_64w_msa: forwards to common_vt_8t_16w_mult_msa with width 64 (four 16-column strips). */ 1327cabdff1aSopenharmony_cistatic void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, 1328cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1329cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1330cabdff1aSopenharmony_ci{ 1331cabdff1aSopenharmony_ci common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, 1332cabdff1aSopenharmony_ci 64); 1333cabdff1aSopenharmony_ci} 1334cabdff1aSopenharmony_ci /* hevc_hv_uni_8t_4w_msa: separable 8-tap filter with horizontal taps from filter_x and vertical taps from filter_y. src is stepped back 3 rows and 3 columns. mask0 is loaded from ff_hevc_mask_arr + 16 and every shuffle is given two different rows (e.g. src0 with src3), so one vector carries the horizontal result of two rows. 7 rows are filtered horizontally before the loop; each pass (height >> 3 passes) loads 8 more rows and stores 8 output rows through ST_W8. */ 1335cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_4w_msa(uint8_t *src, 1336cabdff1aSopenharmony_ci int32_t src_stride, 1337cabdff1aSopenharmony_ci uint8_t *dst, 1338cabdff1aSopenharmony_ci int32_t dst_stride, 1339cabdff1aSopenharmony_ci const int8_t *filter_x, 1340cabdff1aSopenharmony_ci const int8_t *filter_y, 1341cabdff1aSopenharmony_ci int32_t height) 1342cabdff1aSopenharmony_ci{ 1343cabdff1aSopenharmony_ci uint32_t loop_cnt; 1344cabdff1aSopenharmony_ci v16u8 out0, out1; 1345cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1346cabdff1aSopenharmony_ci v16i8 src9, src10, src11, src12, src13, src14; 1347cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1348cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1349cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 1350cabdff1aSopenharmony_ci v8i16 filter_vec; 1351cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1352cabdff1aSopenharmony_ci v16i8 vec8, vec9,
vec10, vec11, vec12, vec13, vec14, vec15; 1353cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410; 1354cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r; 1355cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r; 1356cabdff1aSopenharmony_ci v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; 1357cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1358cabdff1aSopenharmony_ci 1359cabdff1aSopenharmony_ci src -= ((3 * src_stride) + 3); /* step back 3 rows and 3 columns */ 1360cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1361cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1362cabdff1aSopenharmony_ci 1363cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1364cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 1365cabdff1aSopenharmony_ci 1366cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1367cabdff1aSopenharmony_ci 1368cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1369cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1370cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1371cabdff1aSopenharmony_ci 1372cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1373cabdff1aSopenharmony_ci src += (7 * src_stride); 1374cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1375cabdff1aSopenharmony_ci 1376cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1377cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1378cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, 1379cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1380cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, 1381cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1382cabdff1aSopenharmony_ci
1383cabdff1aSopenharmony_ci dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1384cabdff1aSopenharmony_ci filt3); 1385cabdff1aSopenharmony_ci dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1386cabdff1aSopenharmony_ci filt3); 1387cabdff1aSopenharmony_ci dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1388cabdff1aSopenharmony_ci filt3); 1389cabdff1aSopenharmony_ci dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 1390cabdff1aSopenharmony_ci filt3); 1391cabdff1aSopenharmony_ci 1392cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r); 1393cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r); 1394cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r); 1395cabdff1aSopenharmony_ci 1396cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 1397cabdff1aSopenharmony_ci 1398cabdff1aSopenharmony_ci for (loop_cnt = height >> 3; loop_cnt--;) { /* 8 new source rows per pass */ 1399cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13, 1400cabdff1aSopenharmony_ci src14); 1401cabdff1aSopenharmony_ci src += (8 * src_stride); 1402cabdff1aSopenharmony_ci XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14); 1403cabdff1aSopenharmony_ci 1404cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3, 1405cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1406cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3, 1407cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1408cabdff1aSopenharmony_ci VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3, 1409cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1410cabdff1aSopenharmony_ci VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3, 1411cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1412cabdff1aSopenharmony_ci 1413cabdff1aSopenharmony_ci dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1414cabdff1aSopenharmony_ci filt3); 1415cabdff1aSopenharmony_ci dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1416cabdff1aSopenharmony_ci filt3); 1417cabdff1aSopenharmony_ci dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, 1418cabdff1aSopenharmony_ci filt2, filt3); 1419cabdff1aSopenharmony_ci dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 1420cabdff1aSopenharmony_ci filt2, filt3); 1421cabdff1aSopenharmony_ci 1422cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst117, dst66); 1423cabdff1aSopenharmony_ci ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r); 1424cabdff1aSopenharmony_ci ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r); 1425cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r); 1426cabdff1aSopenharmony_ci dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1); 1427cabdff1aSopenharmony_ci dst1110_r = __msa_ilvr_h(dst117, dst1410); 1428cabdff1aSopenharmony_ci 1429cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 1430cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1431cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0, 1432cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1433cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0, 1434cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1435cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0, 1436cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1437cabdff1aSopenharmony_ci dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0, 1438cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1439cabdff1aSopenharmony_ci dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0, 1440cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1441cabdff1aSopenharmony_ci dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1442cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1443cabdff1aSopenharmony_ci dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r, 1444cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1445cabdff1aSopenharmony_ci 1446cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 1447cabdff1aSopenharmony_ci SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6); 1448cabdff1aSopenharmony_ci SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6); 1449cabdff1aSopenharmony_ci SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6); 1450cabdff1aSopenharmony_ci SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7); 1451cabdff1aSopenharmony_ci SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7); 1452cabdff1aSopenharmony_ci PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 1453cabdff1aSopenharmony_ci PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r); 1454cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0_r, dst1_r); 1455cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst4_r, dst5_r); 1456cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 1457cabdff1aSopenharmony_ci dst += (8 * dst_stride); 1458cabdff1aSopenharmony_ci /* keep the interleaves of the newest rows as history for the next pass */ 1459cabdff1aSopenharmony_ci dst10_r = dst98_r; 1460cabdff1aSopenharmony_ci dst32_r = dst1110_r; 1461cabdff1aSopenharmony_ci dst54_r = dst1312_r; 1462cabdff1aSopenharmony_ci dst21_r = dst109_r; 1463cabdff1aSopenharmony_ci dst43_r = dst1211_r; 1464cabdff1aSopenharmony_ci dst65_r = dst1413_r; 1465cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1); 1466cabdff1aSopenharmony_ci } 1467cabdff1aSopenharmony_ci} 1468cabdff1aSopenharmony_ci /* hevc_hv_uni_8t_8multx2mult_msa: separable 8-tap filter (horizontal taps from filter_x, vertical taps from filter_y) over 8-column strips. The outer loop runs width >> 3 times; per strip, 7 rows are filtered horizontally up front and each inner pass (height >> 1 passes) filters 2 new rows, stores 2 output rows with ST_D2 and shifts dst0..dst6 down by two rows. src is stepped back 3 rows and 3 columns once, and src and dst advance by 8 bytes per strip. */ 1469cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src, 1470cabdff1aSopenharmony_ci int32_t src_stride, 1471cabdff1aSopenharmony_ci uint8_t *dst, 1472cabdff1aSopenharmony_ci int32_t dst_stride, 1473cabdff1aSopenharmony_ci const int8_t *filter_x, 1474cabdff1aSopenharmony_ci const int8_t *filter_y, 1475cabdff1aSopenharmony_ci int32_t height,
int32_t width) 1476cabdff1aSopenharmony_ci{ 1477cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 1478cabdff1aSopenharmony_ci uint8_t *src_tmp; 1479cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1480cabdff1aSopenharmony_ci v16u8 out; 1481cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1482cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1483cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1484cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 1485cabdff1aSopenharmony_ci v8i16 filter_vec; 1486cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1487cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1488cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 1489cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 1490cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 1491cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst76_l; 1492cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst65_r, dst87_r; 1493cabdff1aSopenharmony_ci v8i16 dst21_l, dst43_l, dst65_l, dst87_l; 1494cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 1495cabdff1aSopenharmony_ci 1496cabdff1aSopenharmony_ci src -= ((3 * src_stride) + 3); /* step back 3 rows and 3 columns */ 1497cabdff1aSopenharmony_ci 1498cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1499cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1500cabdff1aSopenharmony_ci 1501cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1502cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 1503cabdff1aSopenharmony_ci 1504cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1505cabdff1aSopenharmony_ci 1506cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1507cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1508cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1509cabdff1aSopenharmony_ci 1510cabdff1aSopenharmony_ci for (cnt = width
>> 3; cnt--;) { /* restart from the top of the current strip */ 1511cabdff1aSopenharmony_ci src_tmp = src; 1512cabdff1aSopenharmony_ci dst_tmp = dst; 1513cabdff1aSopenharmony_ci 1514cabdff1aSopenharmony_ci LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1515cabdff1aSopenharmony_ci src_tmp += (7 * src_stride); 1516cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1517cabdff1aSopenharmony_ci 1518cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 1519cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1520cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1521cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1522cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1523cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1524cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1525cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 1526cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1527cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1528cabdff1aSopenharmony_ci filt3); 1529cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1530cabdff1aSopenharmony_ci filt3); 1531cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1532cabdff1aSopenharmony_ci filt3); 1533cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 1534cabdff1aSopenharmony_ci filt2, filt3); 1535cabdff1aSopenharmony_ci 1536cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 1537cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1538cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, 1539cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1540cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, 1541cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1542cabdff1aSopenharmony_ci dst4 =
HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1543cabdff1aSopenharmony_ci filt3); 1544cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1545cabdff1aSopenharmony_ci filt3); 1546cabdff1aSopenharmony_ci dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1547cabdff1aSopenharmony_ci filt3); 1548cabdff1aSopenharmony_ci 1549cabdff1aSopenharmony_ci for (loop_cnt = height >> 1; loop_cnt--;) { /* 2 new source rows in, 2 output rows out per pass */ 1550cabdff1aSopenharmony_ci LD_SB2(src_tmp, src_stride, src7, src8); 1551cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 1552cabdff1aSopenharmony_ci src_tmp += 2 * src_stride; 1553cabdff1aSopenharmony_ci 1554cabdff1aSopenharmony_ci ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, 1555cabdff1aSopenharmony_ci dst10_r, dst32_r, dst54_r, dst21_r); 1556cabdff1aSopenharmony_ci ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, 1557cabdff1aSopenharmony_ci dst10_l, dst32_l, dst54_l, dst21_l); 1558cabdff1aSopenharmony_ci ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); 1559cabdff1aSopenharmony_ci ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); 1560cabdff1aSopenharmony_ci 1561cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, 1562cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1563cabdff1aSopenharmony_ci dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1564cabdff1aSopenharmony_ci filt2, filt3); 1565cabdff1aSopenharmony_ci 1566cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 1567cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 1568cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1569cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 1570cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1571cabdff1aSopenharmony_ci dst0_r >>= 6; 1572cabdff1aSopenharmony_ci dst0_l >>= 6; 1573cabdff1aSopenharmony_ci 1574cabdff1aSopenharmony_ci
VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, 1575cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1576cabdff1aSopenharmony_ci dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1577cabdff1aSopenharmony_ci filt2, filt3); 1578cabdff1aSopenharmony_ci 1579cabdff1aSopenharmony_ci ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); 1580cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, 1581cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1582cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, 1583cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1584cabdff1aSopenharmony_ci dst1_r >>= 6; 1585cabdff1aSopenharmony_ci dst1_l >>= 6; 1586cabdff1aSopenharmony_ci SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6); 1587cabdff1aSopenharmony_ci SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7); 1588cabdff1aSopenharmony_ci 1589cabdff1aSopenharmony_ci PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1); 1590cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(dst0, dst1); 1591cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst_tmp, dst_stride); 1592cabdff1aSopenharmony_ci dst_tmp += (2 * dst_stride); 1593cabdff1aSopenharmony_ci /* slide the horizontally filtered row history down by two rows */ 1594cabdff1aSopenharmony_ci dst0 = dst2; 1595cabdff1aSopenharmony_ci dst1 = dst3; 1596cabdff1aSopenharmony_ci dst2 = dst4; 1597cabdff1aSopenharmony_ci dst3 = dst5; 1598cabdff1aSopenharmony_ci dst4 = dst6; 1599cabdff1aSopenharmony_ci dst5 = dst7; 1600cabdff1aSopenharmony_ci dst6 = dst8; 1601cabdff1aSopenharmony_ci } 1602cabdff1aSopenharmony_ci 1603cabdff1aSopenharmony_ci src += 8; 1604cabdff1aSopenharmony_ci dst += 8; 1605cabdff1aSopenharmony_ci } 1606cabdff1aSopenharmony_ci} 1607cabdff1aSopenharmony_ci /* hevc_hv_uni_8t_8w_msa: forwards to hevc_hv_uni_8t_8multx2mult_msa with width 8, i.e. a single 8-column strip. */ 1608cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_8w_msa(uint8_t *src, 1609cabdff1aSopenharmony_ci int32_t src_stride, 1610cabdff1aSopenharmony_ci uint8_t *dst, 1611cabdff1aSopenharmony_ci int32_t dst_stride, 1612cabdff1aSopenharmony_ci const int8_t *filter_x,
1613cabdff1aSopenharmony_ci const int8_t *filter_y, 1614cabdff1aSopenharmony_ci int32_t height) 1615cabdff1aSopenharmony_ci{ 1616cabdff1aSopenharmony_ci hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 1617cabdff1aSopenharmony_ci filter_x, filter_y, height, 8); 1618cabdff1aSopenharmony_ci} 1619cabdff1aSopenharmony_ci 1620cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_12w_msa(uint8_t *src, 1621cabdff1aSopenharmony_ci int32_t src_stride, 1622cabdff1aSopenharmony_ci uint8_t *dst, 1623cabdff1aSopenharmony_ci int32_t dst_stride, 1624cabdff1aSopenharmony_ci const int8_t *filter_x, 1625cabdff1aSopenharmony_ci const int8_t *filter_y, 1626cabdff1aSopenharmony_ci int32_t height) 1627cabdff1aSopenharmony_ci{ 1628cabdff1aSopenharmony_ci uint32_t loop_cnt; 1629cabdff1aSopenharmony_ci uint8_t *src_tmp, *dst_tmp; 1630cabdff1aSopenharmony_ci v16u8 out0, out1; 1631cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1632cabdff1aSopenharmony_ci v16i8 src11, src12, src13, src14; 1633cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1634cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1635cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1636cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 1637cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410; 1638cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3; 1639cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r; 1640cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l; 1641cabdff1aSopenharmony_ci v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r; 1642cabdff1aSopenharmony_ci v8i16 dst1413_r, dst87_l, filter_vec; 1643cabdff1aSopenharmony_ci v4i32 dst0_r, dst1_r, dst2_r,
dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; 1644cabdff1aSopenharmony_ci v4i32 dst0_l, dst1_l; 1645cabdff1aSopenharmony_ci 1646cabdff1aSopenharmony_ci src -= ((3 * src_stride) + 3); 1647cabdff1aSopenharmony_ci 1648cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1649cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1650cabdff1aSopenharmony_ci 1651cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1652cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 1653cabdff1aSopenharmony_ci 1654cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1655cabdff1aSopenharmony_ci 1656cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 1657cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1658cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1659cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1660cabdff1aSopenharmony_ci 1661cabdff1aSopenharmony_ci src_tmp = src; 1662cabdff1aSopenharmony_ci dst_tmp = dst; 1663cabdff1aSopenharmony_ci 1664cabdff1aSopenharmony_ci LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1665cabdff1aSopenharmony_ci src_tmp += (7 * src_stride); 1666cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1667cabdff1aSopenharmony_ci 1668cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 1669cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1670cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1671cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 1672cabdff1aSopenharmony_ci vec11); 1673cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14, 1674cabdff1aSopenharmony_ci vec15); 1675cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1676cabdff1aSopenharmony_ci filt3); 1677cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, 
vec5, vec6, vec7, filt0, filt1, filt2, 1678cabdff1aSopenharmony_ci filt3); 1679cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1680cabdff1aSopenharmony_ci filt3); 1681cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 1682cabdff1aSopenharmony_ci filt2, filt3); 1683cabdff1aSopenharmony_ci 1684cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1685cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1686cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 1687cabdff1aSopenharmony_ci vec11); 1688cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1689cabdff1aSopenharmony_ci filt3); 1690cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1691cabdff1aSopenharmony_ci filt3); 1692cabdff1aSopenharmony_ci dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1693cabdff1aSopenharmony_ci filt3); 1694cabdff1aSopenharmony_ci 1695cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 1696cabdff1aSopenharmony_ci LD_SB2(src_tmp, src_stride, src7, src8); 1697cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 1698cabdff1aSopenharmony_ci src_tmp += 2 * src_stride; 1699cabdff1aSopenharmony_ci 1700cabdff1aSopenharmony_ci ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r, 1701cabdff1aSopenharmony_ci dst32_r, dst54_r, dst21_r); 1702cabdff1aSopenharmony_ci ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l, 1703cabdff1aSopenharmony_ci dst32_l, dst54_l, dst21_l); 1704cabdff1aSopenharmony_ci ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); 1705cabdff1aSopenharmony_ci ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); 1706cabdff1aSopenharmony_ci 1707cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, 
vec0, vec1, vec2, 1708cabdff1aSopenharmony_ci vec3); 1709cabdff1aSopenharmony_ci dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1710cabdff1aSopenharmony_ci filt3); 1711cabdff1aSopenharmony_ci 1712cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 1713cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 1714cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1715cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 1716cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1717cabdff1aSopenharmony_ci dst0_r >>= 6; 1718cabdff1aSopenharmony_ci dst0_l >>= 6; 1719cabdff1aSopenharmony_ci 1720cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 1721cabdff1aSopenharmony_ci vec3); 1722cabdff1aSopenharmony_ci dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1723cabdff1aSopenharmony_ci filt3); 1724cabdff1aSopenharmony_ci 1725cabdff1aSopenharmony_ci ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); 1726cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, 1727cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1728cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, 1729cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1730cabdff1aSopenharmony_ci dst1_r >>= 6; 1731cabdff1aSopenharmony_ci dst1_l >>= 6; 1732cabdff1aSopenharmony_ci SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6); 1733cabdff1aSopenharmony_ci SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7); 1734cabdff1aSopenharmony_ci 1735cabdff1aSopenharmony_ci PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1); 1736cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0, dst1); 1737cabdff1aSopenharmony_ci ST_D2(out0, 0, 1, dst_tmp, dst_stride); 1738cabdff1aSopenharmony_ci dst_tmp += (2 * dst_stride); 1739cabdff1aSopenharmony_ci 1740cabdff1aSopenharmony_ci dst0 = dst2; 
1741cabdff1aSopenharmony_ci dst1 = dst3; 1742cabdff1aSopenharmony_ci dst2 = dst4; 1743cabdff1aSopenharmony_ci dst3 = dst5; 1744cabdff1aSopenharmony_ci dst4 = dst6; 1745cabdff1aSopenharmony_ci dst5 = dst7; 1746cabdff1aSopenharmony_ci dst6 = dst8; 1747cabdff1aSopenharmony_ci } 1748cabdff1aSopenharmony_ci 1749cabdff1aSopenharmony_ci src += 8; 1750cabdff1aSopenharmony_ci dst += 8; 1751cabdff1aSopenharmony_ci 1752cabdff1aSopenharmony_ci mask4 = LD_SB(ff_hevc_mask_arr + 16); 1753cabdff1aSopenharmony_ci mask5 = mask4 + 2; 1754cabdff1aSopenharmony_ci mask6 = mask4 + 4; 1755cabdff1aSopenharmony_ci mask7 = mask4 + 6; 1756cabdff1aSopenharmony_ci 1757cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1758cabdff1aSopenharmony_ci src += (7 * src_stride); 1759cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1760cabdff1aSopenharmony_ci 1761cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); 1762cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7); 1763cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10, 1764cabdff1aSopenharmony_ci vec11); 1765cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14, 1766cabdff1aSopenharmony_ci vec15); 1767cabdff1aSopenharmony_ci 1768cabdff1aSopenharmony_ci dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1769cabdff1aSopenharmony_ci filt3); 1770cabdff1aSopenharmony_ci dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1771cabdff1aSopenharmony_ci filt3); 1772cabdff1aSopenharmony_ci dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1773cabdff1aSopenharmony_ci filt3); 1774cabdff1aSopenharmony_ci dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 1775cabdff1aSopenharmony_ci filt3); 1776cabdff1aSopenharmony_ci 
1777cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r); 1778cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r); 1779cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r); 1780cabdff1aSopenharmony_ci 1781cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 1782cabdff1aSopenharmony_ci 1783cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 1784cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13, 1785cabdff1aSopenharmony_ci src14); 1786cabdff1aSopenharmony_ci src += (8 * src_stride); 1787cabdff1aSopenharmony_ci XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14); 1788cabdff1aSopenharmony_ci 1789cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2, 1790cabdff1aSopenharmony_ci vec3); 1791cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6, 1792cabdff1aSopenharmony_ci vec7); 1793cabdff1aSopenharmony_ci VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10, 1794cabdff1aSopenharmony_ci vec11); 1795cabdff1aSopenharmony_ci VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13, 1796cabdff1aSopenharmony_ci vec14, vec15); 1797cabdff1aSopenharmony_ci 1798cabdff1aSopenharmony_ci dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1799cabdff1aSopenharmony_ci filt3); 1800cabdff1aSopenharmony_ci dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1801cabdff1aSopenharmony_ci filt3); 1802cabdff1aSopenharmony_ci dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, 1803cabdff1aSopenharmony_ci filt2, filt3); 1804cabdff1aSopenharmony_ci dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 1805cabdff1aSopenharmony_ci filt2, filt3); 1806cabdff1aSopenharmony_ci 1807cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst117, dst66); 1808cabdff1aSopenharmony_ci ILVRL_H2_SH(dst128, dst117, 
dst87_r, dst1211_r); 1809cabdff1aSopenharmony_ci ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r); 1810cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r); 1811cabdff1aSopenharmony_ci dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1); 1812cabdff1aSopenharmony_ci dst1110_r = __msa_ilvr_h(dst117, dst1410); 1813cabdff1aSopenharmony_ci 1814cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 1815cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1816cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0, 1817cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1818cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0, 1819cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1820cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0, 1821cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1822cabdff1aSopenharmony_ci dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0, 1823cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1824cabdff1aSopenharmony_ci dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0, 1825cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1826cabdff1aSopenharmony_ci dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0, 1827cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1828cabdff1aSopenharmony_ci dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r, 1829cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1830cabdff1aSopenharmony_ci 1831cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 1832cabdff1aSopenharmony_ci SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6); 1833cabdff1aSopenharmony_ci SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6); 1834cabdff1aSopenharmony_ci SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6); 1835cabdff1aSopenharmony_ci SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7); 
1836cabdff1aSopenharmony_ci SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7); 1837cabdff1aSopenharmony_ci PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 1838cabdff1aSopenharmony_ci PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r); 1839cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0_r, dst1_r); 1840cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst4_r, dst5_r); 1841cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 1842cabdff1aSopenharmony_ci dst += (8 * dst_stride); 1843cabdff1aSopenharmony_ci 1844cabdff1aSopenharmony_ci dst10_r = dst98_r; 1845cabdff1aSopenharmony_ci dst32_r = dst1110_r; 1846cabdff1aSopenharmony_ci dst54_r = dst1312_r; 1847cabdff1aSopenharmony_ci dst21_r = dst109_r; 1848cabdff1aSopenharmony_ci dst43_r = dst1211_r; 1849cabdff1aSopenharmony_ci dst65_r = dst1413_r; 1850cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1); 1851cabdff1aSopenharmony_ci } 1852cabdff1aSopenharmony_ci} 1853cabdff1aSopenharmony_ci 1854cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_16w_msa(uint8_t *src, 1855cabdff1aSopenharmony_ci int32_t src_stride, 1856cabdff1aSopenharmony_ci uint8_t *dst, 1857cabdff1aSopenharmony_ci int32_t dst_stride, 1858cabdff1aSopenharmony_ci const int8_t *filter_x, 1859cabdff1aSopenharmony_ci const int8_t *filter_y, 1860cabdff1aSopenharmony_ci int32_t height) 1861cabdff1aSopenharmony_ci{ 1862cabdff1aSopenharmony_ci hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 1863cabdff1aSopenharmony_ci filter_x, filter_y, height, 16); 1864cabdff1aSopenharmony_ci} 1865cabdff1aSopenharmony_ci 1866cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_24w_msa(uint8_t *src, 1867cabdff1aSopenharmony_ci int32_t src_stride, 1868cabdff1aSopenharmony_ci uint8_t *dst, 1869cabdff1aSopenharmony_ci int32_t dst_stride, 1870cabdff1aSopenharmony_ci const int8_t *filter_x, 1871cabdff1aSopenharmony_ci const int8_t *filter_y, 1872cabdff1aSopenharmony_ci int32_t height) 
1873cabdff1aSopenharmony_ci{ 1874cabdff1aSopenharmony_ci hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 1875cabdff1aSopenharmony_ci filter_x, filter_y, height, 24); 1876cabdff1aSopenharmony_ci} 1877cabdff1aSopenharmony_ci 1878cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_32w_msa(uint8_t *src, 1879cabdff1aSopenharmony_ci int32_t src_stride, 1880cabdff1aSopenharmony_ci uint8_t *dst, 1881cabdff1aSopenharmony_ci int32_t dst_stride, 1882cabdff1aSopenharmony_ci const int8_t *filter_x, 1883cabdff1aSopenharmony_ci const int8_t *filter_y, 1884cabdff1aSopenharmony_ci int32_t height) 1885cabdff1aSopenharmony_ci{ 1886cabdff1aSopenharmony_ci hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 1887cabdff1aSopenharmony_ci filter_x, filter_y, height, 32); 1888cabdff1aSopenharmony_ci} 1889cabdff1aSopenharmony_ci 1890cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_48w_msa(uint8_t *src, 1891cabdff1aSopenharmony_ci int32_t src_stride, 1892cabdff1aSopenharmony_ci uint8_t *dst, 1893cabdff1aSopenharmony_ci int32_t dst_stride, 1894cabdff1aSopenharmony_ci const int8_t *filter_x, 1895cabdff1aSopenharmony_ci const int8_t *filter_y, 1896cabdff1aSopenharmony_ci int32_t height) 1897cabdff1aSopenharmony_ci{ 1898cabdff1aSopenharmony_ci hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 1899cabdff1aSopenharmony_ci filter_x, filter_y, height, 48); 1900cabdff1aSopenharmony_ci} 1901cabdff1aSopenharmony_ci 1902cabdff1aSopenharmony_cistatic void hevc_hv_uni_8t_64w_msa(uint8_t *src, 1903cabdff1aSopenharmony_ci int32_t src_stride, 1904cabdff1aSopenharmony_ci uint8_t *dst, 1905cabdff1aSopenharmony_ci int32_t dst_stride, 1906cabdff1aSopenharmony_ci const int8_t *filter_x, 1907cabdff1aSopenharmony_ci const int8_t *filter_y, 1908cabdff1aSopenharmony_ci int32_t height) 1909cabdff1aSopenharmony_ci{ 1910cabdff1aSopenharmony_ci hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 1911cabdff1aSopenharmony_ci filter_x, filter_y, height, 64); 
1912cabdff1aSopenharmony_ci} 1913cabdff1aSopenharmony_ci 1914cabdff1aSopenharmony_cistatic void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, 1915cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1916cabdff1aSopenharmony_ci const int8_t *filter) 1917cabdff1aSopenharmony_ci{ 1918cabdff1aSopenharmony_ci v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1; 1919cabdff1aSopenharmony_ci v16u8 out; 1920cabdff1aSopenharmony_ci v8i16 filt, res0; 1921cabdff1aSopenharmony_ci 1922cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[16]); 1923cabdff1aSopenharmony_ci src -= 1; 1924cabdff1aSopenharmony_ci 1925cabdff1aSopenharmony_ci /* rearranging filter */ 1926cabdff1aSopenharmony_ci filt = LD_SH(filter); 1927cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 1928cabdff1aSopenharmony_ci 1929cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1930cabdff1aSopenharmony_ci 1931cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src1); 1932cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 1933cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 1934cabdff1aSopenharmony_ci res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 1935cabdff1aSopenharmony_ci res0 = __msa_srari_h(res0, 6); 1936cabdff1aSopenharmony_ci res0 = __msa_sat_s_h(res0, 7); 1937cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(res0, res0); 1938cabdff1aSopenharmony_ci ST_W2(out, 0, 1, dst, dst_stride); 1939cabdff1aSopenharmony_ci} 1940cabdff1aSopenharmony_ci 1941cabdff1aSopenharmony_cistatic void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, 1942cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1943cabdff1aSopenharmony_ci const int8_t *filter) 1944cabdff1aSopenharmony_ci{ 1945cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 1946cabdff1aSopenharmony_ci v8i16 filt, out0, out1; 1947cabdff1aSopenharmony_ci v16u8 out; 1948cabdff1aSopenharmony_ci 1949cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[16]); 
1950cabdff1aSopenharmony_ci src -= 1; 1951cabdff1aSopenharmony_ci 1952cabdff1aSopenharmony_ci /* rearranging filter */ 1953cabdff1aSopenharmony_ci filt = LD_SH(filter); 1954cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 1955cabdff1aSopenharmony_ci 1956cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1957cabdff1aSopenharmony_ci 1958cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 1959cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 1960cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 1961cabdff1aSopenharmony_ci filt0, filt1, out0, out1); 1962cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, 6); 1963cabdff1aSopenharmony_ci SAT_SH2_SH(out0, out1, 7); 1964cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 1965cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 1966cabdff1aSopenharmony_ci} 1967cabdff1aSopenharmony_ci 1968cabdff1aSopenharmony_cistatic void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, 1969cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1970cabdff1aSopenharmony_ci const int8_t *filter) 1971cabdff1aSopenharmony_ci{ 1972cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 1973cabdff1aSopenharmony_ci v16u8 out; 1974cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 1975cabdff1aSopenharmony_ci 1976cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[16]); 1977cabdff1aSopenharmony_ci src -= 1; 1978cabdff1aSopenharmony_ci 1979cabdff1aSopenharmony_ci /* rearranging filter */ 1980cabdff1aSopenharmony_ci filt = LD_SH(filter); 1981cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 1982cabdff1aSopenharmony_ci 1983cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1984cabdff1aSopenharmony_ci 1985cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 1986cabdff1aSopenharmony_ci src += (4 * src_stride); 1987cabdff1aSopenharmony_ci 1988cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, 
src1, src2, src3); 1989cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 1990cabdff1aSopenharmony_ci filt0, filt1, out0, out1); 1991cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 1992cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 1993cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 1994cabdff1aSopenharmony_ci filt0, filt1, out2, out3); 1995cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 1996cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 1997cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 1998cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 1999cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 2000cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 2001cabdff1aSopenharmony_ci} 2002cabdff1aSopenharmony_ci 2003cabdff1aSopenharmony_cistatic void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, 2004cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2005cabdff1aSopenharmony_ci const int8_t *filter) 2006cabdff1aSopenharmony_ci{ 2007cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2008cabdff1aSopenharmony_ci v16i8 filt0, filt1, mask0, mask1; 2009cabdff1aSopenharmony_ci v16u8 out; 2010cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 2011cabdff1aSopenharmony_ci 2012cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2013cabdff1aSopenharmony_ci src -= 1; 2014cabdff1aSopenharmony_ci 2015cabdff1aSopenharmony_ci /* rearranging filter */ 2016cabdff1aSopenharmony_ci filt = LD_SH(filter); 2017cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2018cabdff1aSopenharmony_ci 2019cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2020cabdff1aSopenharmony_ci 2021cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 2022cabdff1aSopenharmony_ci src += (8 * 
src_stride); 2023cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2024cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 2025cabdff1aSopenharmony_ci filt0, filt1, out0, out1); 2026cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, 2027cabdff1aSopenharmony_ci filt0, filt1, out2, out3); 2028cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 2029cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 2030cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 2031cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 2032cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 2033cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 2034cabdff1aSopenharmony_ci dst += (8 * dst_stride); 2035cabdff1aSopenharmony_ci 2036cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 2037cabdff1aSopenharmony_ci src += (8 * src_stride); 2038cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2039cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 2040cabdff1aSopenharmony_ci filt0, filt1, out0, out1); 2041cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, 2042cabdff1aSopenharmony_ci filt0, filt1, out2, out3); 2043cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 2044cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 2045cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 2046cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 2047cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 2048cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 2049cabdff1aSopenharmony_ci} 2050cabdff1aSopenharmony_ci 2051cabdff1aSopenharmony_cistatic void common_hz_4t_4w_msa(uint8_t *src, int32_t 
src_stride, 2052cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2053cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2054cabdff1aSopenharmony_ci{ 2055cabdff1aSopenharmony_ci if (2 == height) { 2056cabdff1aSopenharmony_ci common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter); 2057cabdff1aSopenharmony_ci } else if (4 == height) { 2058cabdff1aSopenharmony_ci common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter); 2059cabdff1aSopenharmony_ci } else if (8 == height) { 2060cabdff1aSopenharmony_ci common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter); 2061cabdff1aSopenharmony_ci } else if (16 == height) { 2062cabdff1aSopenharmony_ci common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter); 2063cabdff1aSopenharmony_ci } 2064cabdff1aSopenharmony_ci} 2065cabdff1aSopenharmony_ci 2066cabdff1aSopenharmony_cistatic void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, 2067cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2068cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2069cabdff1aSopenharmony_ci{ 2070cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 2071cabdff1aSopenharmony_ci v16u8 out4, out5; 2072cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 2073cabdff1aSopenharmony_ci 2074cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2075cabdff1aSopenharmony_ci src -= 1; 2076cabdff1aSopenharmony_ci 2077cabdff1aSopenharmony_ci /* rearranging filter */ 2078cabdff1aSopenharmony_ci filt = LD_SH(filter); 2079cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2080cabdff1aSopenharmony_ci 2081cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2082cabdff1aSopenharmony_ci 2083cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 2084cabdff1aSopenharmony_ci src += (4 * src_stride); 2085cabdff1aSopenharmony_ci 2086cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2087cabdff1aSopenharmony_ci 
HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, 2088cabdff1aSopenharmony_ci filt1, out0, out1, out2, out3); 2089cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 2090cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 2091cabdff1aSopenharmony_ci out4 = PCKEV_XORI128_UB(out0, out1); 2092cabdff1aSopenharmony_ci out5 = PCKEV_XORI128_UB(out2, out3); 2093cabdff1aSopenharmony_ci ST_W2(out4, 0, 2, dst, dst_stride); 2094cabdff1aSopenharmony_ci ST_H2(out4, 2, 6, dst + 4, dst_stride); 2095cabdff1aSopenharmony_ci ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride); 2096cabdff1aSopenharmony_ci ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2097cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2098cabdff1aSopenharmony_ci 2099cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 2100cabdff1aSopenharmony_ci src += (4 * src_stride); 2101cabdff1aSopenharmony_ci 2102cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2103cabdff1aSopenharmony_ci HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, 2104cabdff1aSopenharmony_ci filt1, out0, out1, out2, out3); 2105cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 2106cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 2107cabdff1aSopenharmony_ci out4 = PCKEV_XORI128_UB(out0, out1); 2108cabdff1aSopenharmony_ci out5 = PCKEV_XORI128_UB(out2, out3); 2109cabdff1aSopenharmony_ci ST_W2(out4, 0, 2, dst, dst_stride); 2110cabdff1aSopenharmony_ci ST_H2(out4, 2, 6, dst + 4, dst_stride); 2111cabdff1aSopenharmony_ci ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride); 2112cabdff1aSopenharmony_ci ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2113cabdff1aSopenharmony_ci} 2114cabdff1aSopenharmony_ci 2115cabdff1aSopenharmony_cistatic void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, 2116cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2117cabdff1aSopenharmony_ci const int8_t *filter, int32_t 
height) 2118cabdff1aSopenharmony_ci{ 2119cabdff1aSopenharmony_ci uint32_t loop_cnt; 2120cabdff1aSopenharmony_ci v16i8 src0, src1, filt0, filt1, mask0, mask1; 2121cabdff1aSopenharmony_ci v16u8 out; 2122cabdff1aSopenharmony_ci v8i16 filt, vec0, vec1, vec2, vec3; 2123cabdff1aSopenharmony_ci 2124cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2125cabdff1aSopenharmony_ci src -= 1; 2126cabdff1aSopenharmony_ci 2127cabdff1aSopenharmony_ci filt = LD_SH(filter); 2128cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2129cabdff1aSopenharmony_ci 2130cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2131cabdff1aSopenharmony_ci 2132cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 2133cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src1); 2134cabdff1aSopenharmony_ci src += (2 * src_stride); 2135cabdff1aSopenharmony_ci 2136cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 2137cabdff1aSopenharmony_ci VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 2138cabdff1aSopenharmony_ci DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1); 2139cabdff1aSopenharmony_ci VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3); 2140cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1); 2141cabdff1aSopenharmony_ci SRARI_H2_SH(vec0, vec1, 6); 2142cabdff1aSopenharmony_ci SAT_SH2_SH(vec0, vec1, 7); 2143cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(vec0, vec1); 2144cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst, dst_stride); 2145cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2146cabdff1aSopenharmony_ci } 2147cabdff1aSopenharmony_ci} 2148cabdff1aSopenharmony_ci 2149cabdff1aSopenharmony_cistatic void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, 2150cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2151cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2152cabdff1aSopenharmony_ci{ 2153cabdff1aSopenharmony_ci uint32_t loop_cnt; 2154cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, 
filt0, filt1, mask0, mask1; 2155cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 2156cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 2157cabdff1aSopenharmony_ci 2158cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2159cabdff1aSopenharmony_ci src -= 1; 2160cabdff1aSopenharmony_ci 2161cabdff1aSopenharmony_ci /* rearranging filter */ 2162cabdff1aSopenharmony_ci filt = LD_SH(filter); 2163cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2164cabdff1aSopenharmony_ci 2165cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2166cabdff1aSopenharmony_ci 2167cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2168cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 2169cabdff1aSopenharmony_ci src += (4 * src_stride); 2170cabdff1aSopenharmony_ci 2171cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2172cabdff1aSopenharmony_ci HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, 2173cabdff1aSopenharmony_ci filt1, out0, out1, out2, out3); 2174cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 2175cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 2176cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 2177cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2, out3); 2178cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 2179cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2180cabdff1aSopenharmony_ci } 2181cabdff1aSopenharmony_ci} 2182cabdff1aSopenharmony_ci 2183cabdff1aSopenharmony_cistatic void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride, 2184cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2185cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2186cabdff1aSopenharmony_ci{ 2187cabdff1aSopenharmony_ci if ((2 == height) || (6 == height)) { 2188cabdff1aSopenharmony_ci common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter, 2189cabdff1aSopenharmony_ci height); 2190cabdff1aSopenharmony_ci } else { 
2191cabdff1aSopenharmony_ci common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter, 2192cabdff1aSopenharmony_ci height); 2193cabdff1aSopenharmony_ci } 2194cabdff1aSopenharmony_ci} 2195cabdff1aSopenharmony_ci 2196cabdff1aSopenharmony_cistatic void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, 2197cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2198cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2199cabdff1aSopenharmony_ci{ 2200cabdff1aSopenharmony_ci uint32_t loop_cnt; 2201cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3; 2202cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 2203cabdff1aSopenharmony_ci v16i8 vec10, vec11; 2204cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 2205cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3, out4, out5; 2206cabdff1aSopenharmony_ci 2207cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2208cabdff1aSopenharmony_ci mask2 = LD_SB(&ff_hevc_mask_arr[32]); 2209cabdff1aSopenharmony_ci 2210cabdff1aSopenharmony_ci src -= 1; 2211cabdff1aSopenharmony_ci 2212cabdff1aSopenharmony_ci /* rearranging filter */ 2213cabdff1aSopenharmony_ci filt = LD_SH(filter); 2214cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2215cabdff1aSopenharmony_ci 2216cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2217cabdff1aSopenharmony_ci mask3 = mask2 + 2; 2218cabdff1aSopenharmony_ci 2219cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 2220cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 2221cabdff1aSopenharmony_ci src += (4 * src_stride); 2222cabdff1aSopenharmony_ci 2223cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2224cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1); 2225cabdff1aSopenharmony_ci DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1); 2226cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, 
vec3); 2227cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1); 2228cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, 6); 2229cabdff1aSopenharmony_ci SAT_SH2_SH(out0, out1, 7); 2230cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 2231cabdff1aSopenharmony_ci ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride); 2232cabdff1aSopenharmony_ci 2233cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5); 2234cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7); 2235cabdff1aSopenharmony_ci DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 2236cabdff1aSopenharmony_ci out2, out3, out4, out5); 2237cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9); 2238cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11); 2239cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1, 2240cabdff1aSopenharmony_ci out2, out3, out4, out5); 2241cabdff1aSopenharmony_ci SRARI_H4_SH(out2, out3, out4, out5, 6); 2242cabdff1aSopenharmony_ci SAT_SH4_SH(out2, out3, out4, out5, 7); 2243cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out2, out3); 2244cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out4, out5); 2245cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 2246cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2247cabdff1aSopenharmony_ci } 2248cabdff1aSopenharmony_ci} 2249cabdff1aSopenharmony_ci 2250cabdff1aSopenharmony_cistatic void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride, 2251cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2252cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2253cabdff1aSopenharmony_ci{ 2254cabdff1aSopenharmony_ci uint32_t loop_cnt; 2255cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2256cabdff1aSopenharmony_ci v16i8 filt0, filt1, mask0, mask1; 2257cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, 
vec2_m, vec3_m; 2258cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; 2259cabdff1aSopenharmony_ci v16u8 out; 2260cabdff1aSopenharmony_ci 2261cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2262cabdff1aSopenharmony_ci src -= 1; 2263cabdff1aSopenharmony_ci 2264cabdff1aSopenharmony_ci /* rearranging filter */ 2265cabdff1aSopenharmony_ci filt = LD_SH(filter); 2266cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2267cabdff1aSopenharmony_ci 2268cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2269cabdff1aSopenharmony_ci 2270cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2271cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src2, src4, src6); 2272cabdff1aSopenharmony_ci LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 2273cabdff1aSopenharmony_ci src += (4 * src_stride); 2274cabdff1aSopenharmony_ci 2275cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2276cabdff1aSopenharmony_ci 2277cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); 2278cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); 2279cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, 2280cabdff1aSopenharmony_ci out0, out1, out2, out3); 2281cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); 2282cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); 2283cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, 2284cabdff1aSopenharmony_ci out0, out1, out2, out3); 2285cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 2286cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 2287cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 2288cabdff1aSopenharmony_ci ST_UB(out, dst); 2289cabdff1aSopenharmony_ci dst += dst_stride; 
2290cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 2291cabdff1aSopenharmony_ci ST_UB(out, dst); 2292cabdff1aSopenharmony_ci dst += dst_stride; 2293cabdff1aSopenharmony_ci 2294cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m); 2295cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m); 2296cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, 2297cabdff1aSopenharmony_ci out4, out5, out6, out7); 2298cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m); 2299cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m); 2300cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, 2301cabdff1aSopenharmony_ci out4, out5, out6, out7); 2302cabdff1aSopenharmony_ci SRARI_H4_SH(out4, out5, out6, out7, 6); 2303cabdff1aSopenharmony_ci SAT_SH4_SH(out4, out5, out6, out7, 7); 2304cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out4, out5); 2305cabdff1aSopenharmony_ci ST_UB(out, dst); 2306cabdff1aSopenharmony_ci dst += dst_stride; 2307cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out6, out7); 2308cabdff1aSopenharmony_ci ST_UB(out, dst); 2309cabdff1aSopenharmony_ci dst += dst_stride; 2310cabdff1aSopenharmony_ci } 2311cabdff1aSopenharmony_ci} 2312cabdff1aSopenharmony_ci 2313cabdff1aSopenharmony_cistatic void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, 2314cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2315cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2316cabdff1aSopenharmony_ci{ 2317cabdff1aSopenharmony_ci uint8_t *dst1 = dst + 16; 2318cabdff1aSopenharmony_ci uint32_t loop_cnt; 2319cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2320cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2321cabdff1aSopenharmony_ci v16i8 filt0, filt1, mask0, mask1, mask00, mask11; 
2322cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 2323cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 2324cabdff1aSopenharmony_ci 2325cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2326cabdff1aSopenharmony_ci src -= 1; 2327cabdff1aSopenharmony_ci 2328cabdff1aSopenharmony_ci /* rearranging filter */ 2329cabdff1aSopenharmony_ci filt = LD_SH(filter); 2330cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2331cabdff1aSopenharmony_ci 2332cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2333cabdff1aSopenharmony_ci mask00 = mask0 + 8; 2334cabdff1aSopenharmony_ci mask11 = mask0 + 10; 2335cabdff1aSopenharmony_ci 2336cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 2337cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src2, src4, src6); 2338cabdff1aSopenharmony_ci LD_SB4(src + 16, src_stride, src1, src3, src5, src7); 2339cabdff1aSopenharmony_ci src += (4 * src_stride); 2340cabdff1aSopenharmony_ci 2341cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2342cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1); 2343cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3); 2344cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5); 2345cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7); 2346cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 2347cabdff1aSopenharmony_ci out0, out1, out2, out3); 2348cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, 2349cabdff1aSopenharmony_ci out0, out1, out2, out3); 2350cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 2351cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 2352cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 2353cabdff1aSopenharmony_ci ST_UB(tmp0, dst); 2354cabdff1aSopenharmony_ci dst += dst_stride; 
2355cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out2, out3); 2356cabdff1aSopenharmony_ci ST_UB(tmp0, dst); 2357cabdff1aSopenharmony_ci dst += dst_stride; 2358cabdff1aSopenharmony_ci 2359cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1); 2360cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3); 2361cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5); 2362cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7); 2363cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 2364cabdff1aSopenharmony_ci out0, out1, out2, out3); 2365cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, 2366cabdff1aSopenharmony_ci out0, out1, out2, out3); 2367cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 2368cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 2369cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 2370cabdff1aSopenharmony_ci ST_UB(tmp0, dst); 2371cabdff1aSopenharmony_ci dst += dst_stride; 2372cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out2, out3); 2373cabdff1aSopenharmony_ci ST_UB(tmp0, dst); 2374cabdff1aSopenharmony_ci dst += dst_stride; 2375cabdff1aSopenharmony_ci 2376cabdff1aSopenharmony_ci /* 8 width */ 2377cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1); 2378cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3); 2379cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5); 2380cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7); 2381cabdff1aSopenharmony_ci 2382cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 2383cabdff1aSopenharmony_ci out0, out1, out2, out3); 2384cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, 
2385cabdff1aSopenharmony_ci out0, out1, out2, out3); 2386cabdff1aSopenharmony_ci 2387cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 2388cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 2389cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 2390cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2, out3); 2391cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride); 2392cabdff1aSopenharmony_ci dst1 += (4 * dst_stride); 2393cabdff1aSopenharmony_ci } 2394cabdff1aSopenharmony_ci} 2395cabdff1aSopenharmony_ci 2396cabdff1aSopenharmony_cistatic void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride, 2397cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2398cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2399cabdff1aSopenharmony_ci{ 2400cabdff1aSopenharmony_ci uint32_t loop_cnt; 2401cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2402cabdff1aSopenharmony_ci v16i8 filt0, filt1, mask0, mask1; 2403cabdff1aSopenharmony_ci v16u8 out; 2404cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m, vec3_m; 2405cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; 2406cabdff1aSopenharmony_ci 2407cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2408cabdff1aSopenharmony_ci src -= 1; 2409cabdff1aSopenharmony_ci 2410cabdff1aSopenharmony_ci /* rearranging filter */ 2411cabdff1aSopenharmony_ci filt = LD_SH(filter); 2412cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2413cabdff1aSopenharmony_ci 2414cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2415cabdff1aSopenharmony_ci 2416cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 2417cabdff1aSopenharmony_ci src0 = LD_SB(src); 2418cabdff1aSopenharmony_ci src1 = LD_SB(src + 8); 2419cabdff1aSopenharmony_ci src2 = LD_SB(src + 16); 2420cabdff1aSopenharmony_ci src3 = LD_SB(src + 24); 2421cabdff1aSopenharmony_ci src += src_stride; 2422cabdff1aSopenharmony_ci src4 = 
LD_SB(src); 2423cabdff1aSopenharmony_ci src5 = LD_SB(src + 8); 2424cabdff1aSopenharmony_ci src6 = LD_SB(src + 16); 2425cabdff1aSopenharmony_ci src7 = LD_SB(src + 24); 2426cabdff1aSopenharmony_ci src += src_stride; 2427cabdff1aSopenharmony_ci 2428cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2429cabdff1aSopenharmony_ci 2430cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); 2431cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); 2432cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, 2433cabdff1aSopenharmony_ci out0, out1, out2, out3); 2434cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); 2435cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); 2436cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, 2437cabdff1aSopenharmony_ci out0, out1, out2, out3); 2438cabdff1aSopenharmony_ci 2439cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m); 2440cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m); 2441cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, 2442cabdff1aSopenharmony_ci out4, out5, out6, out7); 2443cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m); 2444cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m); 2445cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, 2446cabdff1aSopenharmony_ci out4, out5, out6, out7); 2447cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 6); 2448cabdff1aSopenharmony_ci SRARI_H4_SH(out4, out5, out6, out7, 6); 2449cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 2450cabdff1aSopenharmony_ci 
SAT_SH4_SH(out4, out5, out6, out7, 7); 2451cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 2452cabdff1aSopenharmony_ci ST_UB(out, dst); 2453cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 2454cabdff1aSopenharmony_ci ST_UB(out, dst + 16); 2455cabdff1aSopenharmony_ci dst += dst_stride; 2456cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out4, out5); 2457cabdff1aSopenharmony_ci ST_UB(out, dst); 2458cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out6, out7); 2459cabdff1aSopenharmony_ci ST_UB(out, dst + 16); 2460cabdff1aSopenharmony_ci dst += dst_stride; 2461cabdff1aSopenharmony_ci } 2462cabdff1aSopenharmony_ci} 2463cabdff1aSopenharmony_ci 2464cabdff1aSopenharmony_cistatic void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride, 2465cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2466cabdff1aSopenharmony_ci const int8_t *filter) 2467cabdff1aSopenharmony_ci{ 2468cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r; 2469cabdff1aSopenharmony_ci v16i8 src2110, src4332, filt0, filt1; 2470cabdff1aSopenharmony_ci v16u8 out; 2471cabdff1aSopenharmony_ci v8i16 filt, out10; 2472cabdff1aSopenharmony_ci 2473cabdff1aSopenharmony_ci src -= src_stride; 2474cabdff1aSopenharmony_ci 2475cabdff1aSopenharmony_ci filt = LD_SH(filter); 2476cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2477cabdff1aSopenharmony_ci 2478cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2479cabdff1aSopenharmony_ci src += (3 * src_stride); 2480cabdff1aSopenharmony_ci 2481cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2482cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 2483cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 2484cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 2485cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2486cabdff1aSopenharmony_ci src4332 = 
(v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); 2487cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); 2488cabdff1aSopenharmony_ci out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 2489cabdff1aSopenharmony_ci out10 = __msa_srari_h(out10, 6); 2490cabdff1aSopenharmony_ci out10 = __msa_sat_s_h(out10, 7); 2491cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out10, out10); 2492cabdff1aSopenharmony_ci ST_W2(out, 0, 1, dst, dst_stride); 2493cabdff1aSopenharmony_ci} 2494cabdff1aSopenharmony_ci 2495cabdff1aSopenharmony_cistatic void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, 2496cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2497cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2498cabdff1aSopenharmony_ci{ 2499cabdff1aSopenharmony_ci uint32_t loop_cnt; 2500cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 2501cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; 2502cabdff1aSopenharmony_ci v16i8 src2110, src4332, filt0, filt1; 2503cabdff1aSopenharmony_ci v8i16 filt, out10, out32; 2504cabdff1aSopenharmony_ci v16u8 out; 2505cabdff1aSopenharmony_ci 2506cabdff1aSopenharmony_ci src -= src_stride; 2507cabdff1aSopenharmony_ci 2508cabdff1aSopenharmony_ci filt = LD_SH(filter); 2509cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2510cabdff1aSopenharmony_ci 2511cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2512cabdff1aSopenharmony_ci src += (3 * src_stride); 2513cabdff1aSopenharmony_ci 2514cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2515cabdff1aSopenharmony_ci 2516cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 2517cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 2518cabdff1aSopenharmony_ci 2519cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2520cabdff1aSopenharmony_ci LD_SB3(src, src_stride, 
src3, src4, src5); 2521cabdff1aSopenharmony_ci src += (3 * src_stride); 2522cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2523cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); 2524cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); 2525cabdff1aSopenharmony_ci out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 2526cabdff1aSopenharmony_ci 2527cabdff1aSopenharmony_ci src2 = LD_SB(src); 2528cabdff1aSopenharmony_ci src += (src_stride); 2529cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r); 2530cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r); 2531cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 2532cabdff1aSopenharmony_ci out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1); 2533cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out32, 6); 2534cabdff1aSopenharmony_ci SAT_SH2_SH(out10, out32, 7); 2535cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out10, out32); 2536cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 2537cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2538cabdff1aSopenharmony_ci } 2539cabdff1aSopenharmony_ci} 2540cabdff1aSopenharmony_ci 2541cabdff1aSopenharmony_cistatic void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride, 2542cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2543cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2544cabdff1aSopenharmony_ci{ 2545cabdff1aSopenharmony_ci if (2 == height) { 2546cabdff1aSopenharmony_ci common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter); 2547cabdff1aSopenharmony_ci } else { 2548cabdff1aSopenharmony_ci common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter, 2549cabdff1aSopenharmony_ci height); 2550cabdff1aSopenharmony_ci } 2551cabdff1aSopenharmony_ci} 2552cabdff1aSopenharmony_ci 2553cabdff1aSopenharmony_cistatic void common_vt_4t_6w_msa(uint8_t *src, 
int32_t src_stride, 2554cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2555cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2556cabdff1aSopenharmony_ci{ 2557cabdff1aSopenharmony_ci v16u8 out0, out1; 2558cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 2559cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r; 2560cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec; 2561cabdff1aSopenharmony_ci 2562cabdff1aSopenharmony_ci src -= src_stride; 2563cabdff1aSopenharmony_ci 2564cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2565cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2566cabdff1aSopenharmony_ci 2567cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2568cabdff1aSopenharmony_ci src += (3 * src_stride); 2569cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2570cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2571cabdff1aSopenharmony_ci 2572cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 2573cabdff1aSopenharmony_ci src += (2 * src_stride); 2574cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 2575cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2576cabdff1aSopenharmony_ci 2577cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 2578cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 2579cabdff1aSopenharmony_ci 2580cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src5, src6); 2581cabdff1aSopenharmony_ci src += (2 * src_stride); 2582cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 2583cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 2584cabdff1aSopenharmony_ci 2585cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 2586cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 
2587cabdff1aSopenharmony_ci 2588cabdff1aSopenharmony_ci SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6); 2589cabdff1aSopenharmony_ci SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7); 2590cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0_r, dst1_r); 2591cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2_r, dst3_r); 2592cabdff1aSopenharmony_ci ST_W2(out0, 0, 2, dst, dst_stride); 2593cabdff1aSopenharmony_ci ST_H2(out0, 2, 6, dst + 4, dst_stride); 2594cabdff1aSopenharmony_ci ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride); 2595cabdff1aSopenharmony_ci ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2596cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2597cabdff1aSopenharmony_ci 2598cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 2599cabdff1aSopenharmony_ci src += (2 * src_stride); 2600cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 2601cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r); 2602cabdff1aSopenharmony_ci 2603cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1); 2604cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1); 2605cabdff1aSopenharmony_ci 2606cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src5, src6); 2607cabdff1aSopenharmony_ci src += (2 * src_stride); 2608cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 2609cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 2610cabdff1aSopenharmony_ci 2611cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 2612cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 2613cabdff1aSopenharmony_ci 2614cabdff1aSopenharmony_ci SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6); 2615cabdff1aSopenharmony_ci SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7); 2616cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0_r, dst1_r); 2617cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2_r, dst3_r); 2618cabdff1aSopenharmony_ci 
ST_W2(out0, 0, 2, dst, dst_stride); 2619cabdff1aSopenharmony_ci ST_H2(out0, 2, 6, dst + 4, dst_stride); 2620cabdff1aSopenharmony_ci ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride); 2621cabdff1aSopenharmony_ci ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2622cabdff1aSopenharmony_ci} 2623cabdff1aSopenharmony_ci 2624cabdff1aSopenharmony_cistatic void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride, 2625cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2626cabdff1aSopenharmony_ci const int8_t *filter) 2627cabdff1aSopenharmony_ci{ 2628cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 2629cabdff1aSopenharmony_ci v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1; 2630cabdff1aSopenharmony_ci v16u8 out; 2631cabdff1aSopenharmony_ci 2632cabdff1aSopenharmony_ci src -= src_stride; 2633cabdff1aSopenharmony_ci 2634cabdff1aSopenharmony_ci /* rearranging filter_y */ 2635cabdff1aSopenharmony_ci filt = LD_SH(filter); 2636cabdff1aSopenharmony_ci SPLATI_H2_SH(filt, 0, 1, filt0, filt1); 2637cabdff1aSopenharmony_ci 2638cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 2639cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 2640cabdff1aSopenharmony_ci ILVR_B2_SH(src1, src0, src3, src2, src01, src23); 2641cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1); 2642cabdff1aSopenharmony_ci ILVR_B2_SH(src2, src1, src4, src3, src12, src34); 2643cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1); 2644cabdff1aSopenharmony_ci SRARI_H2_SH(tmp0, tmp1, 6); 2645cabdff1aSopenharmony_ci SAT_SH2_SH(tmp0, tmp1, 7); 2646cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(tmp0, tmp1); 2647cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst, dst_stride); 2648cabdff1aSopenharmony_ci} 2649cabdff1aSopenharmony_ci 2650cabdff1aSopenharmony_cistatic void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride, 2651cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 
2652cabdff1aSopenharmony_ci const int8_t *filter) 2653cabdff1aSopenharmony_ci{ 2654cabdff1aSopenharmony_ci uint32_t loop_cnt; 2655cabdff1aSopenharmony_ci uint64_t out0, out1, out2; 2656cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 2657cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2; 2658cabdff1aSopenharmony_ci v8i16 filt, filt0, filt1; 2659cabdff1aSopenharmony_ci 2660cabdff1aSopenharmony_ci src -= src_stride; 2661cabdff1aSopenharmony_ci 2662cabdff1aSopenharmony_ci /* rearranging filter_y */ 2663cabdff1aSopenharmony_ci filt = LD_SH(filter); 2664cabdff1aSopenharmony_ci SPLATI_H2_SH(filt, 0, 1, filt0, filt1); 2665cabdff1aSopenharmony_ci 2666cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2667cabdff1aSopenharmony_ci src += (3 * src_stride); 2668cabdff1aSopenharmony_ci 2669cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2670cabdff1aSopenharmony_ci ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2); 2671cabdff1aSopenharmony_ci 2672cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 2673cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src3, src4, src5); 2674cabdff1aSopenharmony_ci src += (3 * src_stride); 2675cabdff1aSopenharmony_ci 2676cabdff1aSopenharmony_ci XORI_B3_128_SB(src3, src4, src5); 2677cabdff1aSopenharmony_ci ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4); 2678cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2679cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2680cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1); 2681cabdff1aSopenharmony_ci SRARI_H2_SH(tmp0, tmp1, 6); 2682cabdff1aSopenharmony_ci tmp2 = __msa_srari_h(tmp2, 6); 2683cabdff1aSopenharmony_ci SAT_SH3_SH(tmp0, tmp1, tmp2, 7); 2684cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2); 2685cabdff1aSopenharmony_ci XORI_B2_128_SH(tmp0, tmp2); 2686cabdff1aSopenharmony_ci 2687cabdff1aSopenharmony_ci out0 = 
__msa_copy_u_d((v2i64) tmp0, 0); 2688cabdff1aSopenharmony_ci out1 = __msa_copy_u_d((v2i64) tmp0, 1); 2689cabdff1aSopenharmony_ci out2 = __msa_copy_u_d((v2i64) tmp2, 0); 2690cabdff1aSopenharmony_ci SD(out0, dst); 2691cabdff1aSopenharmony_ci dst += dst_stride; 2692cabdff1aSopenharmony_ci SD(out1, dst); 2693cabdff1aSopenharmony_ci dst += dst_stride; 2694cabdff1aSopenharmony_ci SD(out2, dst); 2695cabdff1aSopenharmony_ci dst += dst_stride; 2696cabdff1aSopenharmony_ci 2697cabdff1aSopenharmony_ci src2 = src5; 2698cabdff1aSopenharmony_ci vec0 = vec3; 2699cabdff1aSopenharmony_ci vec2 = vec4; 2700cabdff1aSopenharmony_ci } 2701cabdff1aSopenharmony_ci} 2702cabdff1aSopenharmony_ci 2703cabdff1aSopenharmony_cistatic void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, 2704cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2705cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2706cabdff1aSopenharmony_ci{ 2707cabdff1aSopenharmony_ci uint32_t loop_cnt; 2708cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src7, src8, src9, src10; 2709cabdff1aSopenharmony_ci v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; 2710cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 2711cabdff1aSopenharmony_ci v8i16 filt, out0_r, out1_r, out2_r, out3_r; 2712cabdff1aSopenharmony_ci 2713cabdff1aSopenharmony_ci src -= src_stride; 2714cabdff1aSopenharmony_ci 2715cabdff1aSopenharmony_ci filt = LD_SH(filter); 2716cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2717cabdff1aSopenharmony_ci 2718cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2719cabdff1aSopenharmony_ci src += (3 * src_stride); 2720cabdff1aSopenharmony_ci 2721cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2722cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2723cabdff1aSopenharmony_ci 2724cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2725cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, 
src10); 2726cabdff1aSopenharmony_ci src += (4 * src_stride); 2727cabdff1aSopenharmony_ci 2728cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 2729cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9, 2730cabdff1aSopenharmony_ci src72_r, src87_r, src98_r, src109_r); 2731cabdff1aSopenharmony_ci out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1); 2732cabdff1aSopenharmony_ci out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1); 2733cabdff1aSopenharmony_ci out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1); 2734cabdff1aSopenharmony_ci out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 2735cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); 2736cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 2737cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); 2738cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); 2739cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 2740cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2741cabdff1aSopenharmony_ci 2742cabdff1aSopenharmony_ci src10_r = src98_r; 2743cabdff1aSopenharmony_ci src21_r = src109_r; 2744cabdff1aSopenharmony_ci src2 = src10; 2745cabdff1aSopenharmony_ci } 2746cabdff1aSopenharmony_ci} 2747cabdff1aSopenharmony_ci 2748cabdff1aSopenharmony_cistatic void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride, 2749cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2750cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2751cabdff1aSopenharmony_ci{ 2752cabdff1aSopenharmony_ci if (2 == height) { 2753cabdff1aSopenharmony_ci common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter); 2754cabdff1aSopenharmony_ci } else if (6 == height) { 2755cabdff1aSopenharmony_ci common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter); 2756cabdff1aSopenharmony_ci } else { 2757cabdff1aSopenharmony_ci common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride, 
2758cabdff1aSopenharmony_ci filter, height); 2759cabdff1aSopenharmony_ci } 2760cabdff1aSopenharmony_ci} 2761cabdff1aSopenharmony_ci 2762cabdff1aSopenharmony_cistatic void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride, 2763cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2764cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2765cabdff1aSopenharmony_ci{ 2766cabdff1aSopenharmony_ci uint32_t loop_cnt; 2767cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 2768cabdff1aSopenharmony_ci v16u8 out0, out1; 2769cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r; 2770cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; 2771cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554; 2772cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1; 2773cabdff1aSopenharmony_ci v8i16 filter_vec; 2774cabdff1aSopenharmony_ci 2775cabdff1aSopenharmony_ci src -= (1 * src_stride); 2776cabdff1aSopenharmony_ci 2777cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2778cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2779cabdff1aSopenharmony_ci 2780cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2781cabdff1aSopenharmony_ci src += (3 * src_stride); 2782cabdff1aSopenharmony_ci 2783cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2784cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2785cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 2786cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); 2787cabdff1aSopenharmony_ci 2788cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 2789cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src3, src4, src5, src6); 2790cabdff1aSopenharmony_ci src += (4 * src_stride); 2791cabdff1aSopenharmony_ci 2792cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 
2793cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2794cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 2795cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); 2796cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 2797cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l); 2798cabdff1aSopenharmony_ci src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); 2799cabdff1aSopenharmony_ci 2800cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 2801cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 2802cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 2803cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 2804cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 2805cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1); 2806cabdff1aSopenharmony_ci 2807cabdff1aSopenharmony_ci SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6); 2808cabdff1aSopenharmony_ci SRARI_H2_SH(dst0_l, dst1_l, 6); 2809cabdff1aSopenharmony_ci SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7); 2810cabdff1aSopenharmony_ci SAT_SH2_SH(dst0_l, dst1_l, 7); 2811cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0_r, dst1_r); 2812cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2_r, dst3_r); 2813cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2814cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0_l, dst1_l); 2815cabdff1aSopenharmony_ci ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride); 2816cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2817cabdff1aSopenharmony_ci 2818cabdff1aSopenharmony_ci src2 = src6; 2819cabdff1aSopenharmony_ci src10_r = src54_r; 2820cabdff1aSopenharmony_ci src21_r = src65_r; 2821cabdff1aSopenharmony_ci src2110 = src6554; 
2822cabdff1aSopenharmony_ci } 2823cabdff1aSopenharmony_ci} 2824cabdff1aSopenharmony_ci 2825cabdff1aSopenharmony_cistatic void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride, 2826cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2827cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2828cabdff1aSopenharmony_ci{ 2829cabdff1aSopenharmony_ci uint32_t loop_cnt; 2830cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 2831cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; 2832cabdff1aSopenharmony_ci v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; 2833cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 2834cabdff1aSopenharmony_ci v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 2835cabdff1aSopenharmony_ci 2836cabdff1aSopenharmony_ci src -= src_stride; 2837cabdff1aSopenharmony_ci 2838cabdff1aSopenharmony_ci filt = LD_SH(filter); 2839cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2840cabdff1aSopenharmony_ci 2841cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2842cabdff1aSopenharmony_ci src += (3 * src_stride); 2843cabdff1aSopenharmony_ci 2844cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2845cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2846cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 2847cabdff1aSopenharmony_ci 2848cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2849cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src3, src4, src5, src6); 2850cabdff1aSopenharmony_ci src += (4 * src_stride); 2851cabdff1aSopenharmony_ci 2852cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 2853cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 2854cabdff1aSopenharmony_ci src32_r, src43_r, src54_r, src65_r); 2855cabdff1aSopenharmony_ci ILVL_B4_SB(src3, src2, src4, src3, src5, src4, 
src6, src5, 2856cabdff1aSopenharmony_ci src32_l, src43_l, src54_l, src65_l); 2857cabdff1aSopenharmony_ci out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 2858cabdff1aSopenharmony_ci out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 2859cabdff1aSopenharmony_ci out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 2860cabdff1aSopenharmony_ci out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 2861cabdff1aSopenharmony_ci out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 2862cabdff1aSopenharmony_ci out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 2863cabdff1aSopenharmony_ci out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1); 2864cabdff1aSopenharmony_ci out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1); 2865cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); 2866cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6); 2867cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 2868cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 2869cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 2870cabdff1aSopenharmony_ci out3_r, tmp0, tmp1, tmp2, tmp3); 2871cabdff1aSopenharmony_ci XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); 2872cabdff1aSopenharmony_ci ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 2873cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2874cabdff1aSopenharmony_ci 2875cabdff1aSopenharmony_ci src10_r = src54_r; 2876cabdff1aSopenharmony_ci src21_r = src65_r; 2877cabdff1aSopenharmony_ci src10_l = src54_l; 2878cabdff1aSopenharmony_ci src21_l = src65_l; 2879cabdff1aSopenharmony_ci src2 = src6; 2880cabdff1aSopenharmony_ci } 2881cabdff1aSopenharmony_ci} 2882cabdff1aSopenharmony_ci 2883cabdff1aSopenharmony_cistatic void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, 2884cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2885cabdff1aSopenharmony_ci const int8_t *filter, int32_t 
height) 2886cabdff1aSopenharmony_ci{ 2887cabdff1aSopenharmony_ci uint32_t loop_cnt; 2888cabdff1aSopenharmony_ci uint64_t out0, out1; 2889cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2890cabdff1aSopenharmony_ci v16i8 src11, filt0, filt1; 2891cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 2892cabdff1aSopenharmony_ci v16i8 src109_r, src10_l, src32_l, src21_l, src43_l; 2893cabdff1aSopenharmony_ci v16u8 out; 2894cabdff1aSopenharmony_ci v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l; 2895cabdff1aSopenharmony_ci 2896cabdff1aSopenharmony_ci src -= src_stride; 2897cabdff1aSopenharmony_ci 2898cabdff1aSopenharmony_ci filt = LD_SH(filter); 2899cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 2900cabdff1aSopenharmony_ci 2901cabdff1aSopenharmony_ci /* 16 width */ 2902cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 2903cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2904cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2905cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 2906cabdff1aSopenharmony_ci 2907cabdff1aSopenharmony_ci /* 8 width */ 2908cabdff1aSopenharmony_ci LD_SB3(src + 16, src_stride, src6, src7, src8); 2909cabdff1aSopenharmony_ci src += (3 * src_stride); 2910cabdff1aSopenharmony_ci XORI_B3_128_SB(src6, src7, src8); 2911cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 2912cabdff1aSopenharmony_ci 2913cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 2914cabdff1aSopenharmony_ci /* 16 width */ 2915cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 2916cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 2917cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2918cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 2919cabdff1aSopenharmony_ci 2920cabdff1aSopenharmony_ci 
/* 8 width */ 2921cabdff1aSopenharmony_ci LD_SB2(src + 16, src_stride, src9, src10); 2922cabdff1aSopenharmony_ci src += (2 * src_stride); 2923cabdff1aSopenharmony_ci XORI_B2_128_SB(src9, src10); 2924cabdff1aSopenharmony_ci ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 2925cabdff1aSopenharmony_ci 2926cabdff1aSopenharmony_ci /* 16 width */ 2927cabdff1aSopenharmony_ci out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 2928cabdff1aSopenharmony_ci out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 2929cabdff1aSopenharmony_ci out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 2930cabdff1aSopenharmony_ci out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 2931cabdff1aSopenharmony_ci 2932cabdff1aSopenharmony_ci /* 8 width */ 2933cabdff1aSopenharmony_ci out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 2934cabdff1aSopenharmony_ci out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 2935cabdff1aSopenharmony_ci 2936cabdff1aSopenharmony_ci /* 16 + 8 width */ 2937cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); 2938cabdff1aSopenharmony_ci SRARI_H2_SH(out0_l, out1_l, 6); 2939cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 2940cabdff1aSopenharmony_ci SAT_SH2_SH(out0_l, out1_l, 7); 2941cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0_r, out0_l); 2942cabdff1aSopenharmony_ci ST_UB(out, dst); 2943cabdff1aSopenharmony_ci PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r); 2944cabdff1aSopenharmony_ci XORI_B2_128_SH(out2_r, out3_r); 2945cabdff1aSopenharmony_ci out0 = __msa_copy_u_d((v2i64) out2_r, 0); 2946cabdff1aSopenharmony_ci out1 = __msa_copy_u_d((v2i64) out3_r, 0); 2947cabdff1aSopenharmony_ci SD(out0, dst + 16); 2948cabdff1aSopenharmony_ci dst += dst_stride; 2949cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out1_r, out1_l); 2950cabdff1aSopenharmony_ci ST_UB(out, dst); 2951cabdff1aSopenharmony_ci SD(out1, dst + 16); 2952cabdff1aSopenharmony_ci dst += dst_stride; 
2953cabdff1aSopenharmony_ci 2954cabdff1aSopenharmony_ci /* 16 width */ 2955cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src5, src2); 2956cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src2); 2957cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 2958cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 2959cabdff1aSopenharmony_ci 2960cabdff1aSopenharmony_ci /* 8 width */ 2961cabdff1aSopenharmony_ci LD_SB2(src + 16, src_stride, src11, src8); 2962cabdff1aSopenharmony_ci src += (2 * src_stride); 2963cabdff1aSopenharmony_ci XORI_B2_128_SB(src11, src8); 2964cabdff1aSopenharmony_ci ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); 2965cabdff1aSopenharmony_ci 2966cabdff1aSopenharmony_ci /* 16 width */ 2967cabdff1aSopenharmony_ci out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1); 2968cabdff1aSopenharmony_ci out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1); 2969cabdff1aSopenharmony_ci out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 2970cabdff1aSopenharmony_ci out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1); 2971cabdff1aSopenharmony_ci 2972cabdff1aSopenharmony_ci /* 8 width */ 2973cabdff1aSopenharmony_ci out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1); 2974cabdff1aSopenharmony_ci out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1); 2975cabdff1aSopenharmony_ci 2976cabdff1aSopenharmony_ci /* 16 + 8 width */ 2977cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); 2978cabdff1aSopenharmony_ci SRARI_H2_SH(out0_l, out1_l, 6); 2979cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 2980cabdff1aSopenharmony_ci SAT_SH2_SH(out0_l, out1_l, 7); 2981cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0_r, out0_l); 2982cabdff1aSopenharmony_ci ST_UB(out, dst); 2983cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2_r, out2_r); 2984cabdff1aSopenharmony_ci ST_D1(out, 0, dst + 16); 2985cabdff1aSopenharmony_ci dst += dst_stride; 
2986cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out1_r, out1_l); 2987cabdff1aSopenharmony_ci ST_UB(out, dst); 2988cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out3_r, out3_r); 2989cabdff1aSopenharmony_ci ST_D1(out, 0, dst + 16); 2990cabdff1aSopenharmony_ci dst += dst_stride; 2991cabdff1aSopenharmony_ci } 2992cabdff1aSopenharmony_ci} 2993cabdff1aSopenharmony_ci 2994cabdff1aSopenharmony_cistatic void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride, 2995cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2996cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 2997cabdff1aSopenharmony_ci{ 2998cabdff1aSopenharmony_ci uint32_t loop_cnt; 2999cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10; 3000cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r; 3001cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src87_r, src109_r; 3002cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 3003cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src76_l, src98_l; 3004cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src87_l, src109_l; 3005cabdff1aSopenharmony_ci v8i16 filt; 3006cabdff1aSopenharmony_ci v16i8 filt0, filt1; 3007cabdff1aSopenharmony_ci v16u8 out; 3008cabdff1aSopenharmony_ci 3009cabdff1aSopenharmony_ci src -= src_stride; 3010cabdff1aSopenharmony_ci 3011cabdff1aSopenharmony_ci filt = LD_SH(filter); 3012cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 3013cabdff1aSopenharmony_ci 3014cabdff1aSopenharmony_ci /* 16 width */ 3015cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3016cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3017cabdff1aSopenharmony_ci 3018cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3019cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3020cabdff1aSopenharmony_ci 3021cabdff1aSopenharmony_ci /* next 16 width */ 3022cabdff1aSopenharmony_ci LD_SB3(src 
+ 16, src_stride, src6, src7, src8); 3023cabdff1aSopenharmony_ci src += (3 * src_stride); 3024cabdff1aSopenharmony_ci 3025cabdff1aSopenharmony_ci XORI_B3_128_SB(src6, src7, src8); 3026cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3027cabdff1aSopenharmony_ci ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); 3028cabdff1aSopenharmony_ci 3029cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 3030cabdff1aSopenharmony_ci /* 16 width */ 3031cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 3032cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 3033cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3034cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 3035cabdff1aSopenharmony_ci 3036cabdff1aSopenharmony_ci /* 16 width */ 3037cabdff1aSopenharmony_ci out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3038cabdff1aSopenharmony_ci out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 3039cabdff1aSopenharmony_ci out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3040cabdff1aSopenharmony_ci out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 3041cabdff1aSopenharmony_ci 3042cabdff1aSopenharmony_ci /* 16 width */ 3043cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6); 3044cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7); 3045cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0_r, out0_l); 3046cabdff1aSopenharmony_ci ST_UB(out, dst); 3047cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out1_r, out1_l); 3048cabdff1aSopenharmony_ci ST_UB(out, dst + dst_stride); 3049cabdff1aSopenharmony_ci 3050cabdff1aSopenharmony_ci src10_r = src32_r; 3051cabdff1aSopenharmony_ci src21_r = src43_r; 3052cabdff1aSopenharmony_ci src10_l = src32_l; 3053cabdff1aSopenharmony_ci src21_l = src43_l; 3054cabdff1aSopenharmony_ci src2 = src4; 3055cabdff1aSopenharmony_ci 3056cabdff1aSopenharmony_ci /* next 16 width */ 
3057cabdff1aSopenharmony_ci LD_SB2(src + 16, src_stride, src9, src10); 3058cabdff1aSopenharmony_ci src += (2 * src_stride); 3059cabdff1aSopenharmony_ci XORI_B2_128_SB(src9, src10); 3060cabdff1aSopenharmony_ci ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3061cabdff1aSopenharmony_ci ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); 3062cabdff1aSopenharmony_ci 3063cabdff1aSopenharmony_ci /* next 16 width */ 3064cabdff1aSopenharmony_ci out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 3065cabdff1aSopenharmony_ci out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1); 3066cabdff1aSopenharmony_ci out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 3067cabdff1aSopenharmony_ci out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1); 3068cabdff1aSopenharmony_ci 3069cabdff1aSopenharmony_ci /* next 16 width */ 3070cabdff1aSopenharmony_ci SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6); 3071cabdff1aSopenharmony_ci SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7); 3072cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2_r, out2_l); 3073cabdff1aSopenharmony_ci ST_UB(out, dst + 16); 3074cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out3_r, out3_l); 3075cabdff1aSopenharmony_ci ST_UB(out, dst + 16 + dst_stride); 3076cabdff1aSopenharmony_ci 3077cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3078cabdff1aSopenharmony_ci 3079cabdff1aSopenharmony_ci src76_r = src98_r; 3080cabdff1aSopenharmony_ci src87_r = src109_r; 3081cabdff1aSopenharmony_ci src76_l = src98_l; 3082cabdff1aSopenharmony_ci src87_l = src109_l; 3083cabdff1aSopenharmony_ci src8 = src10; 3084cabdff1aSopenharmony_ci } 3085cabdff1aSopenharmony_ci} 3086cabdff1aSopenharmony_ci 3087cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_4x2_msa(uint8_t *src, 3088cabdff1aSopenharmony_ci int32_t src_stride, 3089cabdff1aSopenharmony_ci uint8_t *dst, 3090cabdff1aSopenharmony_ci int32_t dst_stride, 3091cabdff1aSopenharmony_ci const int8_t *filter_x, 3092cabdff1aSopenharmony_ci const int8_t *filter_y) 
3093cabdff1aSopenharmony_ci{ 3094cabdff1aSopenharmony_ci v16u8 out; 3095cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 3096cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3097cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 3098cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3099cabdff1aSopenharmony_ci v16i8 mask1; 3100cabdff1aSopenharmony_ci v8i16 filter_vec, tmp; 3101cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 3102cabdff1aSopenharmony_ci v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43; 3103cabdff1aSopenharmony_ci v4i32 dst0, dst1; 3104cabdff1aSopenharmony_ci 3105cabdff1aSopenharmony_ci src -= (src_stride + 1); 3106cabdff1aSopenharmony_ci 3107cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 3108cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3109cabdff1aSopenharmony_ci 3110cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 3111cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 3112cabdff1aSopenharmony_ci 3113cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3114cabdff1aSopenharmony_ci 3115cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3116cabdff1aSopenharmony_ci 3117cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 3118cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 3119cabdff1aSopenharmony_ci 3120cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1); 3121cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3); 3122cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5); 3123cabdff1aSopenharmony_ci 3124cabdff1aSopenharmony_ci dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3125cabdff1aSopenharmony_ci dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3126cabdff1aSopenharmony_ci dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3127cabdff1aSopenharmony_ci 3128cabdff1aSopenharmony_ci ILVRL_H2_SH(dst31, 
dst20, dst10, dst32); 3129cabdff1aSopenharmony_ci ILVRL_H2_SH(dst42, dst31, dst21, dst43); 3130cabdff1aSopenharmony_ci 3131cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 3132cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 3133cabdff1aSopenharmony_ci dst0 >>= 6; 3134cabdff1aSopenharmony_ci dst1 >>= 6; 3135cabdff1aSopenharmony_ci tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); 3136cabdff1aSopenharmony_ci tmp = __msa_srari_h(tmp, 6); 3137cabdff1aSopenharmony_ci tmp = __msa_sat_s_h(tmp, 7); 3138cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(tmp, tmp); 3139cabdff1aSopenharmony_ci ST_W2(out, 0, 1, dst, dst_stride); 3140cabdff1aSopenharmony_ci} 3141cabdff1aSopenharmony_ci 3142cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_4x4_msa(uint8_t *src, 3143cabdff1aSopenharmony_ci int32_t src_stride, 3144cabdff1aSopenharmony_ci uint8_t *dst, 3145cabdff1aSopenharmony_ci int32_t dst_stride, 3146cabdff1aSopenharmony_ci const int8_t *filter_x, 3147cabdff1aSopenharmony_ci const int8_t *filter_y) 3148cabdff1aSopenharmony_ci{ 3149cabdff1aSopenharmony_ci v16u8 out; 3150cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 3151cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3152cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 3153cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3154cabdff1aSopenharmony_ci v16i8 mask1; 3155cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3156cabdff1aSopenharmony_ci v8i16 filter_vec, tmp0, tmp1; 3157cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63; 3158cabdff1aSopenharmony_ci v8i16 dst10, dst32, dst54, dst21, dst43, dst65; 3159cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3; 3160cabdff1aSopenharmony_ci 3161cabdff1aSopenharmony_ci src -= (src_stride + 1); 3162cabdff1aSopenharmony_ci 3163cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 3164cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 
3165cabdff1aSopenharmony_ci 3166cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 3167cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 3168cabdff1aSopenharmony_ci 3169cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3170cabdff1aSopenharmony_ci 3171cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3172cabdff1aSopenharmony_ci 3173cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 3174cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 3175cabdff1aSopenharmony_ci 3176cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1); 3177cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3); 3178cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5); 3179cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7); 3180cabdff1aSopenharmony_ci 3181cabdff1aSopenharmony_ci dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3182cabdff1aSopenharmony_ci dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3183cabdff1aSopenharmony_ci dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3184cabdff1aSopenharmony_ci dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3185cabdff1aSopenharmony_ci 3186cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10, dst43); 3187cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21, dst54); 3188cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32, dst65); 3189cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 3190cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 3191cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1); 3192cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1); 3193cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 3194cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 
3195cabdff1aSopenharmony_ci SRARI_H2_SH(tmp0, tmp1, 6); 3196cabdff1aSopenharmony_ci SAT_SH2_SH(tmp0, tmp1, 7); 3197cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(tmp0, tmp1); 3198cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 3199cabdff1aSopenharmony_ci} 3200cabdff1aSopenharmony_ci 3201cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src, 3202cabdff1aSopenharmony_ci int32_t src_stride, 3203cabdff1aSopenharmony_ci uint8_t *dst, 3204cabdff1aSopenharmony_ci int32_t dst_stride, 3205cabdff1aSopenharmony_ci const int8_t *filter_x, 3206cabdff1aSopenharmony_ci const int8_t *filter_y, 3207cabdff1aSopenharmony_ci int32_t height) 3208cabdff1aSopenharmony_ci{ 3209cabdff1aSopenharmony_ci uint32_t loop_cnt; 3210cabdff1aSopenharmony_ci v16u8 out0, out1; 3211cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 3212cabdff1aSopenharmony_ci v16i8 src6, src7, src8, src9, src10; 3213cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3214cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 3215cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3216cabdff1aSopenharmony_ci v16i8 mask1; 3217cabdff1aSopenharmony_ci v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3; 3218cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3219cabdff1aSopenharmony_ci v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 3220cabdff1aSopenharmony_ci v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; 3221cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 3222cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst65_r, dst87_r; 3223cabdff1aSopenharmony_ci v8i16 dst98_r, dst109_r; 3224cabdff1aSopenharmony_ci 3225cabdff1aSopenharmony_ci src -= (src_stride + 1); 3226cabdff1aSopenharmony_ci 3227cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 3228cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3229cabdff1aSopenharmony_ci 3230cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 
3231cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 3232cabdff1aSopenharmony_ci 3233cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3234cabdff1aSopenharmony_ci 3235cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3236cabdff1aSopenharmony_ci 3237cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3238cabdff1aSopenharmony_ci src += (3 * src_stride); 3239cabdff1aSopenharmony_ci 3240cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3241cabdff1aSopenharmony_ci 3242cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 3243cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3); 3244cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3245cabdff1aSopenharmony_ci dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3246cabdff1aSopenharmony_ci ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 3247cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 3248cabdff1aSopenharmony_ci 3249cabdff1aSopenharmony_ci for (loop_cnt = height >> 3; loop_cnt--;) { 3250cabdff1aSopenharmony_ci LD_SB8(src, src_stride, 3251cabdff1aSopenharmony_ci src3, src4, src5, src6, src7, src8, src9, src10); 3252cabdff1aSopenharmony_ci src += (8 * src_stride); 3253cabdff1aSopenharmony_ci 3254cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3255cabdff1aSopenharmony_ci 3256cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1); 3257cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3); 3258cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5); 3259cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7); 3260cabdff1aSopenharmony_ci 3261cabdff1aSopenharmony_ci dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3262cabdff1aSopenharmony_ci dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 
3263cabdff1aSopenharmony_ci dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3264cabdff1aSopenharmony_ci dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3265cabdff1aSopenharmony_ci 3266cabdff1aSopenharmony_ci dst32_r = __msa_ilvr_h(dst73, dst22); 3267cabdff1aSopenharmony_ci ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 3268cabdff1aSopenharmony_ci ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 3269cabdff1aSopenharmony_ci ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 3270cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 3271cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst22, dst106); 3272cabdff1aSopenharmony_ci 3273cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3274cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3275cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 3276cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 3277cabdff1aSopenharmony_ci dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 3278cabdff1aSopenharmony_ci dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 3279cabdff1aSopenharmony_ci dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 3280cabdff1aSopenharmony_ci dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 3281cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 3282cabdff1aSopenharmony_ci SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6); 3283cabdff1aSopenharmony_ci PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r, 3284cabdff1aSopenharmony_ci dst5_r, dst4_r, dst7_r, dst6_r, 3285cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 3286cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6); 3287cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 3288cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 3289cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 3290cabdff1aSopenharmony_ci ST_W8(out0, 
out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 3291cabdff1aSopenharmony_ci dst += (8 * dst_stride); 3292cabdff1aSopenharmony_ci 3293cabdff1aSopenharmony_ci dst10_r = dst98_r; 3294cabdff1aSopenharmony_ci dst21_r = dst109_r; 3295cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 3296cabdff1aSopenharmony_ci } 3297cabdff1aSopenharmony_ci} 3298cabdff1aSopenharmony_ci 3299cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_4w_msa(uint8_t *src, 3300cabdff1aSopenharmony_ci int32_t src_stride, 3301cabdff1aSopenharmony_ci uint8_t *dst, 3302cabdff1aSopenharmony_ci int32_t dst_stride, 3303cabdff1aSopenharmony_ci const int8_t *filter_x, 3304cabdff1aSopenharmony_ci const int8_t *filter_y, 3305cabdff1aSopenharmony_ci int32_t height) 3306cabdff1aSopenharmony_ci{ 3307cabdff1aSopenharmony_ci if (2 == height) { 3308cabdff1aSopenharmony_ci hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride, 3309cabdff1aSopenharmony_ci filter_x, filter_y); 3310cabdff1aSopenharmony_ci } else if (4 == height) { 3311cabdff1aSopenharmony_ci hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride, 3312cabdff1aSopenharmony_ci filter_x, filter_y); 3313cabdff1aSopenharmony_ci } else if (0 == (height % 8)) { 3314cabdff1aSopenharmony_ci hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride, 3315cabdff1aSopenharmony_ci filter_x, filter_y, height); 3316cabdff1aSopenharmony_ci } 3317cabdff1aSopenharmony_ci} 3318cabdff1aSopenharmony_ci 3319cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_6w_msa(uint8_t *src, 3320cabdff1aSopenharmony_ci int32_t src_stride, 3321cabdff1aSopenharmony_ci uint8_t *dst, 3322cabdff1aSopenharmony_ci int32_t dst_stride, 3323cabdff1aSopenharmony_ci const int8_t *filter_x, 3324cabdff1aSopenharmony_ci const int8_t *filter_y, 3325cabdff1aSopenharmony_ci int32_t height) 3326cabdff1aSopenharmony_ci{ 3327cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 3328cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 3329cabdff1aSopenharmony_ci v16i8 
src7, src8, src9, src10; 3330cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3331cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3332cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 3333cabdff1aSopenharmony_ci v16i8 mask1; 3334cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filter_vec; 3335cabdff1aSopenharmony_ci v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9; 3336cabdff1aSopenharmony_ci v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 3337cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 3338cabdff1aSopenharmony_ci v4i32 dst4_r, dst5_r, dst6_r, dst7_r; 3339cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst21_r, dst43_r; 3340cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst21_l, dst43_l; 3341cabdff1aSopenharmony_ci v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r; 3342cabdff1aSopenharmony_ci v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l; 3343cabdff1aSopenharmony_ci v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l; 3344cabdff1aSopenharmony_ci 3345cabdff1aSopenharmony_ci src -= (src_stride + 1); 3346cabdff1aSopenharmony_ci 3347cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 3348cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3349cabdff1aSopenharmony_ci 3350cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 3351cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 3352cabdff1aSopenharmony_ci 3353cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3354cabdff1aSopenharmony_ci 3355cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3356cabdff1aSopenharmony_ci 3357cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3358cabdff1aSopenharmony_ci src += (3 * src_stride); 3359cabdff1aSopenharmony_ci 3360cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3361cabdff1aSopenharmony_ci 3362cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, 
vec0, vec1); 3363cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 3364cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3365cabdff1aSopenharmony_ci 3366cabdff1aSopenharmony_ci dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3367cabdff1aSopenharmony_ci dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3368cabdff1aSopenharmony_ci dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3369cabdff1aSopenharmony_ci 3370cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 3371cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 3372cabdff1aSopenharmony_ci 3373cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 3374cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3375cabdff1aSopenharmony_ci 3376cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3377cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 3378cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 3379cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 3380cabdff1aSopenharmony_ci 3381cabdff1aSopenharmony_ci dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3382cabdff1aSopenharmony_ci dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3383cabdff1aSopenharmony_ci dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3384cabdff1aSopenharmony_ci dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3385cabdff1aSopenharmony_ci 3386cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 3387cabdff1aSopenharmony_ci VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3); 3388cabdff1aSopenharmony_ci VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5); 3389cabdff1aSopenharmony_ci VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7); 
3390cabdff1aSopenharmony_ci 3391cabdff1aSopenharmony_ci dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3392cabdff1aSopenharmony_ci dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3393cabdff1aSopenharmony_ci dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3394cabdff1aSopenharmony_ci dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3395cabdff1aSopenharmony_ci 3396cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 3397cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 3398cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 3399cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 3400cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l); 3401cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l); 3402cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l); 3403cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l); 3404cabdff1aSopenharmony_ci 3405cabdff1aSopenharmony_ci PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l); 3406cabdff1aSopenharmony_ci PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l); 3407cabdff1aSopenharmony_ci dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l); 3408cabdff1aSopenharmony_ci 3409cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3410cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3411cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 3412cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 3413cabdff1aSopenharmony_ci dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 3414cabdff1aSopenharmony_ci dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 3415cabdff1aSopenharmony_ci dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 3416cabdff1aSopenharmony_ci dst7_r = 
HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 3417cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1); 3418cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1); 3419cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1); 3420cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1); 3421cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 3422cabdff1aSopenharmony_ci SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6); 3423cabdff1aSopenharmony_ci SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6); 3424cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1); 3425cabdff1aSopenharmony_ci PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3); 3426cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5); 3427cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6); 3428cabdff1aSopenharmony_ci SRARI_H2_SH(tmp4, tmp5, 6); 3429cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3,7); 3430cabdff1aSopenharmony_ci SAT_SH2_SH(tmp4, tmp5,7); 3431cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 3432cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 3433cabdff1aSopenharmony_ci out2 = PCKEV_XORI128_UB(tmp4, tmp5); 3434cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 3435cabdff1aSopenharmony_ci ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); 3436cabdff1aSopenharmony_ci}
3437cabdff1aSopenharmony_ci
/*
 * Horizontal + vertical 4-tap filter for a single block that is 8 columns
 * wide and 2 rows high (size taken from the function name; two rows are
 * written through ST_D2).
 *
 * src, src_stride - 8-bit source pointer and the row stride added to it
 *                   (uint8_t pointer arithmetic, so counted in bytes). The
 *                   pointer is moved back by one row and one column and five
 *                   rows are loaded from there.
 * dst, dst_stride - 8-bit destination pointer and the row stride handed to
 *                   the store macro.
 * filter_x        - taps used by the per-row HEVC_FILT_4TAP_SH pass.
 * filter_y        - taps used by the HEVC_FILT_4TAP pass over pairs of
 *                   vertically adjacent rows.
 *
 * NOTE(review): every upper-case helper is a macro from the MSA headers
 * included at the top of the file. Remarks about what they do are inferred
 * from their names and argument lists; confirm against those headers.
 */
3438cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3439cabdff1aSopenharmony_ci                                   int32_t src_stride,
3440cabdff1aSopenharmony_ci                                   uint8_t *dst,
3441cabdff1aSopenharmony_ci                                   int32_t dst_stride,
3442cabdff1aSopenharmony_ci                                   const int8_t *filter_x,
3443cabdff1aSopenharmony_ci                                   const int8_t *filter_y)
3444cabdff1aSopenharmony_ci{
3445cabdff1aSopenharmony_ci    v16u8 out;
3446cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3447cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3448cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
3449cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3450cabdff1aSopenharmony_ci    v16i8 mask1;
3451cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3452cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4;
3453cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3454cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3455cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3456cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r;
3457cabdff1aSopenharmony_ci
    /* Start one row above and one column to the left of the caller's
     * position. */
3458cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3459cabdff1aSopenharmony_ci
    /* filt0/filt1 come from elements 0 and 1 of the vector loaded from
     * filter_x. */
3460cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3461cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3462cabdff1aSopenharmony_ci
    /* filter_y goes through UNPCK_R_SB_SH first (by name: signed bytes
     * widened to halfwords - TODO confirm) before filt_h0/filt_h1 are
     * splatted from it. */
3463cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3464cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3465cabdff1aSopenharmony_ci
3466cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3467cabdff1aSopenharmony_ci
    /* mask0 is the first 16-byte row of ff_hevc_mask_arr (labelled "8 width
     * cases": byte pairs 0,1 1,2 ... 7,8); mask1 holds the same indices
     * shifted by two. */
3468cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3469cabdff1aSopenharmony_ci
    /* Load five rows. XORI_B5_128_SB presumably flips bit 7 of every byte so
     * the pixels can be treated as signed; PCKEV_XORI128_UB at the end would
     * then undo it - TODO confirm. */
3470cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3471cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3472cabdff1aSopenharmony_ci
    /* Shuffle each row with mask0 and mask1 to form the two operands of the
     * horizontal filter. */
3473cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3474cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3475cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3476cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3477cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3478cabdff1aSopenharmony_ci
    /* Horizontal pass: one 16-bit vector (dstN) per input row N. */
3479cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3480cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3481cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3482cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3483cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    /* Interleave vertically adjacent rows: dstNM_r / dstNM_l combine rows M
     * and N (by name the right and left halves of the interleave). */
3484cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3485cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3486cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3487cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    /* Vertical pass into 32-bit vectors: output row 0 uses rows 0..3, output
     * row 1 uses rows 1..4. */
3488cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3489cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3490cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3491cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    /* Two 6-bit reductions: SRA_4V on the 32-bit sums, then SRARI on the
     * packed 16-bit values (by name a plain and a rounding right shift),
     * followed by SAT with argument 7 - TODO confirm exact semantics. */
3492cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3493cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3494cabdff1aSopenharmony_ci    SRARI_H2_SH(out0_r, out1_r, 6);
3495cabdff1aSopenharmony_ci    SAT_SH2_SH(out0_r, out1_r, 7);
    /* Pack both rows into one byte vector and store its elements 0 and 1 to
     * dst with dst_stride. */
3496cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0_r, out1_r);
3497cabdff1aSopenharmony_ci    ST_D2(out, 0, 1, dst, dst_stride);
3498cabdff1aSopenharmony_ci}
3499cabdff1aSopenharmony_ci
/*
 * Horizontal + vertical 4-tap filter for a block that is 4 rows high and
 * width8mult * 8 columns wide.
 *
 * The loop runs width8mult times; each pass loads seven rows, produces four
 * output rows for one 8-column strip and then advances src and dst by 8.
 *
 * src, src_stride - 8-bit source pointer and its row stride in bytes; the
 *                   pointer is first moved back by one row and one column.
 * dst, dst_stride - 8-bit destination pointer and the row stride handed to
 *                   the store macro.
 * filter_x        - taps for the per-row HEVC_FILT_4TAP_SH pass.
 * filter_y        - taps for the HEVC_FILT_4TAP pass across rows.
 * width8mult      - number of 8-column strips to process.
 *
 * NOTE(review): macro behaviour is inferred from names; confirm against the
 * MSA macro headers.
 */
3500cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
3501cabdff1aSopenharmony_ci                                       int32_t src_stride,
3502cabdff1aSopenharmony_ci                                       uint8_t *dst,
3503cabdff1aSopenharmony_ci                                       int32_t dst_stride,
3504cabdff1aSopenharmony_ci                                       const int8_t *filter_x,
3505cabdff1aSopenharmony_ci                                       const int8_t *filter_y,
3506cabdff1aSopenharmony_ci                                       int32_t width8mult)
3507cabdff1aSopenharmony_ci{
3508cabdff1aSopenharmony_ci    uint32_t cnt;
3509cabdff1aSopenharmony_ci    v16u8 out0, out1;
3510cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3511cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3512cabdff1aSopenharmony_ci    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3513cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3514cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3515cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3516cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3517cabdff1aSopenharmony_ci
    /* Start one row above and one column to the left of the caller's
     * position. */
3518cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3519cabdff1aSopenharmony_ci
    /* Filter setup: filt0/filt1 from filter_x, filt_h0/filt_h1 from the
     * unpacked filter_y. */
3520cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3521cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3522cabdff1aSopenharmony_ci
3523cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3524cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3525cabdff1aSopenharmony_ci
3526cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3527cabdff1aSopenharmony_ci
    /* First row of ff_hevc_mask_arr ("8 width cases") and the same indices
     * shifted by two. */
3528cabdff1aSopenharmony_ci    mask0 = LD_SB(ff_hevc_mask_arr);
3529cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3530cabdff1aSopenharmony_ci
    /* One iteration per 8-column strip. */
3531cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
        /* Seven rows are loaded for this strip; src then moves on to the
         * next strip. */
3532cabdff1aSopenharmony_ci        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3533cabdff1aSopenharmony_ci        src += 8;
3534cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3535cabdff1aSopenharmony_ci
        /* Horizontal pass on rows 0..2. */
3536cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3537cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3538cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3539cabdff1aSopenharmony_ci
3540cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3541cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3542cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3543cabdff1aSopenharmony_ci
3544cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3545cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3546cabdff1aSopenharmony_ci
        /* Horizontal pass on rows 3..6 (vec0..vec7 are reused). */
3547cabdff1aSopenharmony_ci        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3548cabdff1aSopenharmony_ci        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3549cabdff1aSopenharmony_ci        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3550cabdff1aSopenharmony_ci        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3551cabdff1aSopenharmony_ci
3552cabdff1aSopenharmony_ci        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3553cabdff1aSopenharmony_ci        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3554cabdff1aSopenharmony_ci        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3555cabdff1aSopenharmony_ci        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3556cabdff1aSopenharmony_ci
3557cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3558cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3559cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3560cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3561cabdff1aSopenharmony_ci
        /* Vertical pass: output row k combines the interleaved row pairs
         * starting at rows k and k + 2. */
3562cabdff1aSopenharmony_ci        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3563cabdff1aSopenharmony_ci        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3564cabdff1aSopenharmony_ci        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3565cabdff1aSopenharmony_ci        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3566cabdff1aSopenharmony_ci        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3567cabdff1aSopenharmony_ci        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3568cabdff1aSopenharmony_ci        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3569cabdff1aSopenharmony_ci        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3570cabdff1aSopenharmony_ci
        /* Shift by 6, pack to 16 bit, shift by 6 again (SRARI, by name with
         * rounding), saturate, pack to bytes. */
3571cabdff1aSopenharmony_ci        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3572cabdff1aSopenharmony_ci        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3573cabdff1aSopenharmony_ci
3574cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3575cabdff1aSopenharmony_ci                    dst3_r, tmp0, tmp1, tmp2, tmp3);
3576cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3577cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3578cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3579cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        /* Store four rows for this strip, then step dst to the next strip. */
3580cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3581cabdff1aSopenharmony_ci        dst += 8;
3582cabdff1aSopenharmony_ci    }
3583cabdff1aSopenharmony_ci}
3584cabdff1aSopenharmony_ci
/*
 * Horizontal + vertical 4-tap filter for a single block that is 8 columns
 * wide and 6 rows high (size taken from the function name; ST_D4 followed by
 * ST_D2 at dst + 4 * dst_stride writes the result).
 *
 * Nine rows are loaded (five, then four more), each is filtered
 * horizontally, and six output rows are produced by the vertical pass.
 *
 * src, src_stride - 8-bit source pointer and its row stride in bytes; the
 *                   pointer is first moved back by one row and one column.
 * dst, dst_stride - 8-bit destination pointer and its row stride in bytes.
 * filter_x        - taps for the per-row HEVC_FILT_4TAP_SH pass.
 * filter_y        - taps for the HEVC_FILT_4TAP pass across rows.
 *
 * NOTE(review): macro behaviour is inferred from names; confirm against the
 * MSA macro headers.
 */
3585cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3586cabdff1aSopenharmony_ci                                   int32_t src_stride,
3587cabdff1aSopenharmony_ci                                   uint8_t *dst,
3588cabdff1aSopenharmony_ci                                   int32_t dst_stride,
3589cabdff1aSopenharmony_ci                                   const int8_t *filter_x,
3590cabdff1aSopenharmony_ci                                   const int8_t *filter_y)
3591cabdff1aSopenharmony_ci{
3592cabdff1aSopenharmony_ci    v16u8 out0, out1, out2;
3593cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3594cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3595cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
3596cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3597cabdff1aSopenharmony_ci    v16i8 mask1;
3598cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3599cabdff1aSopenharmony_ci    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3600cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3601cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3602cabdff1aSopenharmony_ci    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3603cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3604cabdff1aSopenharmony_ci    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3605cabdff1aSopenharmony_ci    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3606cabdff1aSopenharmony_ci    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3607cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3608cabdff1aSopenharmony_ci
    /* Start one row above and one column to the left of the caller's
     * position. */
3609cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3610cabdff1aSopenharmony_ci
    /* Filter setup: filt0/filt1 from filter_x, filt_h0/filt_h1 from the
     * unpacked filter_y. */
3611cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3612cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3613cabdff1aSopenharmony_ci
3614cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3615cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3616cabdff1aSopenharmony_ci
3617cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3618cabdff1aSopenharmony_ci
    /* mask0 is the "8 width cases" row of ff_hevc_mask_arr; mask1 holds the
     * same indices shifted by two. */
3619cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3620cabdff1aSopenharmony_ci
    /* Load all nine rows in two batches (5 + 4). */
3621cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3622cabdff1aSopenharmony_ci    src += (5 * src_stride);
3623cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src5, src6, src7, src8);
3624cabdff1aSopenharmony_ci
3625cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3626cabdff1aSopenharmony_ci    XORI_B4_128_SB(src5, src6, src7, src8);
3627cabdff1aSopenharmony_ci
    /* Shuffle every row with mask0/mask1 to build the horizontal filter
     * operands. */
3628cabdff1aSopenharmony_ci    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3629cabdff1aSopenharmony_ci    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3630cabdff1aSopenharmony_ci    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3631cabdff1aSopenharmony_ci    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3632cabdff1aSopenharmony_ci    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3633cabdff1aSopenharmony_ci    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3634cabdff1aSopenharmony_ci    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3635cabdff1aSopenharmony_ci    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3636cabdff1aSopenharmony_ci    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3637cabdff1aSopenharmony_ci
    /* Horizontal pass: one 16-bit vector (dstN) per input row N. */
3638cabdff1aSopenharmony_ci    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3639cabdff1aSopenharmony_ci    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3640cabdff1aSopenharmony_ci    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3641cabdff1aSopenharmony_ci    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3642cabdff1aSopenharmony_ci    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3643cabdff1aSopenharmony_ci    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
3644cabdff1aSopenharmony_ci    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
3645cabdff1aSopenharmony_ci    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
3646cabdff1aSopenharmony_ci    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
3647cabdff1aSopenharmony_ci
    /* Interleave each pair of vertically adjacent rows. */
3648cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3649cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3650cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3651cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3652cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3653cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3654cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3655cabdff1aSopenharmony_ci    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3656cabdff1aSopenharmony_ci
    /* Vertical pass: output row k combines the interleaved row pairs
     * starting at rows k and k + 2, for k = 0..5. */
3657cabdff1aSopenharmony_ci    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3658cabdff1aSopenharmony_ci    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3659cabdff1aSopenharmony_ci    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3660cabdff1aSopenharmony_ci    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3661cabdff1aSopenharmony_ci    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3662cabdff1aSopenharmony_ci    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3663cabdff1aSopenharmony_ci    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3664cabdff1aSopenharmony_ci    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3665cabdff1aSopenharmony_ci    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3666cabdff1aSopenharmony_ci    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3667cabdff1aSopenharmony_ci    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3668cabdff1aSopenharmony_ci    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3669cabdff1aSopenharmony_ci
    /* Shift by 6, pack to 16 bit, shift by 6 again (SRARI, by name with
     * rounding), saturate, pack to bytes. */
3670cabdff1aSopenharmony_ci    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3671cabdff1aSopenharmony_ci    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3672cabdff1aSopenharmony_ci    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3673cabdff1aSopenharmony_ci    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3674cabdff1aSopenharmony_ci                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3675cabdff1aSopenharmony_ci    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3676cabdff1aSopenharmony_ci    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3677cabdff1aSopenharmony_ci    SRARI_H2_SH(out4_r, out5_r, 6);
3678cabdff1aSopenharmony_ci    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3679cabdff1aSopenharmony_ci    SAT_SH2_SH(out4_r, out5_r, 7);
3680cabdff1aSopenharmony_ci    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3681cabdff1aSopenharmony_ci    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3682cabdff1aSopenharmony_ci    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3683cabdff1aSopenharmony_ci
    /* Rows 0..3 from out0/out1, rows 4..5 from out2. */
3684cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3685cabdff1aSopenharmony_ci    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3686cabdff1aSopenharmony_ci}
3687cabdff1aSopenharmony_ci
/*
 * Horizontal + vertical 4-tap filter for a block that is width8mult * 8
 * columns wide, processed four rows at a time.
 *
 * The outer loop walks the 8-column strips. For each strip the first three
 * rows are filtered horizontally once, then the inner loop runs height >> 2
 * times, each pass consuming four new rows and storing four output rows.
 * If height is not a multiple of 4 the remaining rows are not written.
 *
 * src, src_stride - 8-bit source pointer and its row stride in bytes; the
 *                   pointer is first moved back by one row and one column.
 * dst, dst_stride - 8-bit destination pointer and its row stride in bytes.
 * filter_x        - taps for the per-row HEVC_FILT_4TAP_SH pass.
 * filter_y        - taps for the HEVC_FILT_4TAP pass across rows.
 * height          - number of output rows.
 * width8mult      - number of 8-column strips.
 *
 * NOTE(review): macro behaviour is inferred from names; confirm against the
 * MSA macro headers.
 */
3688cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
3689cabdff1aSopenharmony_ci                                           int32_t src_stride,
3690cabdff1aSopenharmony_ci                                           uint8_t *dst,
3691cabdff1aSopenharmony_ci                                           int32_t dst_stride,
3692cabdff1aSopenharmony_ci                                           const int8_t *filter_x,
3693cabdff1aSopenharmony_ci                                           const int8_t *filter_y,
3694cabdff1aSopenharmony_ci                                           int32_t height,
3695cabdff1aSopenharmony_ci                                           int32_t width8mult)
3696cabdff1aSopenharmony_ci{
3697cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
3698cabdff1aSopenharmony_ci    uint8_t *src_tmp;
3699cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
3700cabdff1aSopenharmony_ci    v16u8 out0, out1;
3701cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
3702cabdff1aSopenharmony_ci    v8i16 filt0, filt1;
3703cabdff1aSopenharmony_ci    v8i16 filt_h0, filt_h1, filter_vec;
3704cabdff1aSopenharmony_ci    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3705cabdff1aSopenharmony_ci    v16i8 mask1;
3706cabdff1aSopenharmony_ci    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3707cabdff1aSopenharmony_ci    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3708cabdff1aSopenharmony_ci    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3709cabdff1aSopenharmony_ci    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3710cabdff1aSopenharmony_ci    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3711cabdff1aSopenharmony_ci    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3712cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r;
3713cabdff1aSopenharmony_ci
    /* Start one row above and one column to the left of the caller's
     * position. */
3714cabdff1aSopenharmony_ci    src -= (src_stride + 1);
3715cabdff1aSopenharmony_ci
    /* Filter setup: filt0/filt1 from filter_x, filt_h0/filt_h1 from the
     * unpacked filter_y. */
3716cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_x);
3717cabdff1aSopenharmony_ci    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3718cabdff1aSopenharmony_ci
3719cabdff1aSopenharmony_ci    filter_vec = LD_SH(filter_y);
3720cabdff1aSopenharmony_ci    UNPCK_R_SB_SH(filter_vec, filter_vec);
3721cabdff1aSopenharmony_ci
3722cabdff1aSopenharmony_ci    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3723cabdff1aSopenharmony_ci
    /* mask0 is the "8 width cases" row of ff_hevc_mask_arr; mask1 holds the
     * same indices shifted by two. */
3724cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
3725cabdff1aSopenharmony_ci
    /* One iteration per 8-column strip; src/dst advance by 8 at the end. */
3726cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
3727cabdff1aSopenharmony_ci        src_tmp = src;
3728cabdff1aSopenharmony_ci        dst_tmp = dst;
3729cabdff1aSopenharmony_ci
        /* Prime the vertical filter with the first three rows of the
         * strip. */
3730cabdff1aSopenharmony_ci        LD_SB3(src_tmp, src_stride, src0, src1, src2);
3731cabdff1aSopenharmony_ci        src_tmp += (3 * src_stride);
3732cabdff1aSopenharmony_ci
3733cabdff1aSopenharmony_ci        XORI_B3_128_SB(src0, src1, src2);
3734cabdff1aSopenharmony_ci
3735cabdff1aSopenharmony_ci        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3736cabdff1aSopenharmony_ci        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3737cabdff1aSopenharmony_ci        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3738cabdff1aSopenharmony_ci
3739cabdff1aSopenharmony_ci        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3740cabdff1aSopenharmony_ci        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3741cabdff1aSopenharmony_ci        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3742cabdff1aSopenharmony_ci
3743cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3744cabdff1aSopenharmony_ci        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3745cabdff1aSopenharmony_ci
        /* Four output rows per iteration. */
3746cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 2); loop_cnt--;) {
3747cabdff1aSopenharmony_ci            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3748cabdff1aSopenharmony_ci            src_tmp += (4 * src_stride);
3749cabdff1aSopenharmony_ci
3750cabdff1aSopenharmony_ci            XORI_B4_128_SB(src3, src4, src5, src6);
3751cabdff1aSopenharmony_ci
            /* Horizontal pass on the four new rows. */
3752cabdff1aSopenharmony_ci            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3753cabdff1aSopenharmony_ci            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3754cabdff1aSopenharmony_ci            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3755cabdff1aSopenharmony_ci            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3756cabdff1aSopenharmony_ci
3757cabdff1aSopenharmony_ci            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3758cabdff1aSopenharmony_ci            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3759cabdff1aSopenharmony_ci            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3760cabdff1aSopenharmony_ci            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3761cabdff1aSopenharmony_ci
3762cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3763cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3764cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3765cabdff1aSopenharmony_ci            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3766cabdff1aSopenharmony_ci
            /* Vertical pass for the four output rows of this iteration. */
3767cabdff1aSopenharmony_ci            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3768cabdff1aSopenharmony_ci            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3769cabdff1aSopenharmony_ci            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3770cabdff1aSopenharmony_ci            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3771cabdff1aSopenharmony_ci            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3772cabdff1aSopenharmony_ci            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3773cabdff1aSopenharmony_ci            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3774cabdff1aSopenharmony_ci            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3775cabdff1aSopenharmony_ci
3776cabdff1aSopenharmony_ci            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3777cabdff1aSopenharmony_ci            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3778cabdff1aSopenharmony_ci
3779cabdff1aSopenharmony_ci            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3780cabdff1aSopenharmony_ci                        dst2_l, dst2_r, dst3_l, dst3_r,
3781cabdff1aSopenharmony_ci                        out0_r, out1_r, out2_r, out3_r);
3782cabdff1aSopenharmony_ci
3783cabdff1aSopenharmony_ci            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3784cabdff1aSopenharmony_ci            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3785cabdff1aSopenharmony_ci            out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3786cabdff1aSopenharmony_ci            out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3787cabdff1aSopenharmony_ci            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3788cabdff1aSopenharmony_ci            dst_tmp += (4 * dst_stride);
3789cabdff1aSopenharmony_ci
            /* Carry the last rows over: pairs 5/4 and 6/5 and row 6 take the
             * place of pairs 1/0 and 2/1 and row 2 for the next
             * iteration. */
3790cabdff1aSopenharmony_ci            dst10_r = dst54_r;
3791cabdff1aSopenharmony_ci            dst10_l = dst54_l;
3792cabdff1aSopenharmony_ci            dst21_r = dst65_r;
3793cabdff1aSopenharmony_ci            dst21_l = dst65_l;
3794cabdff1aSopenharmony_ci            dst2 = dst6;
3795cabdff1aSopenharmony_ci        }
3796cabdff1aSopenharmony_ci
3797cabdff1aSopenharmony_ci        src += 8;
3798cabdff1aSopenharmony_ci        dst += 8;
3799cabdff1aSopenharmony_ci    }
3800cabdff1aSopenharmony_ci}
3801cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 8-column horizontal + vertical 4-tap filter.
 *
 * height == 2            -> hevc_hv_uni_4t_8x2_msa
 * height == 4            -> hevc_hv_uni_4t_8multx4_msa with one strip
 * height == 6            -> hevc_hv_uni_4t_8x6_msa
 * height multiple of 4   -> hevc_hv_uni_4t_8multx4mult_msa with one strip
 *
 * Any other height matches no branch and nothing is written to dst.
 */
3802cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_8w_msa(uint8_t *src,
3803cabdff1aSopenharmony_ci                                  int32_t src_stride,
3804cabdff1aSopenharmony_ci                                  uint8_t *dst,
3805cabdff1aSopenharmony_ci                                  int32_t dst_stride,
3806cabdff1aSopenharmony_ci                                  const int8_t *filter_x,
3807cabdff1aSopenharmony_ci                                  const int8_t *filter_y,
3808cabdff1aSopenharmony_ci                                  int32_t height)
3809cabdff1aSopenharmony_ci{
3810cabdff1aSopenharmony_ci    if (2 == height) {
3811cabdff1aSopenharmony_ci        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3812cabdff1aSopenharmony_ci                               filter_x, filter_y);
3813cabdff1aSopenharmony_ci    } else if (4 == height) {
3814cabdff1aSopenharmony_ci        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3815cabdff1aSopenharmony_ci                                   filter_x, filter_y, 1);
3816cabdff1aSopenharmony_ci    } else if (6 == height) {
3817cabdff1aSopenharmony_ci        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3818cabdff1aSopenharmony_ci                               filter_x, filter_y);
3819cabdff1aSopenharmony_ci    } else if (0 == (height % 4)) {
3820cabdff1aSopenharmony_ci        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3821cabdff1aSopenharmony_ci                                       filter_x, filter_y, height, 1);
3822cabdff1aSopenharmony_ci    }
3823cabdff1aSopenharmony_ci}
3824cabdff1aSopenharmony_ci
3825cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_12w_msa(uint8_t *src, 3826cabdff1aSopenharmony_ci int32_t src_stride, 3827cabdff1aSopenharmony_ci uint8_t *dst, 3828cabdff1aSopenharmony_ci int32_t
dst_stride, 3829cabdff1aSopenharmony_ci const int8_t *filter_x, 3830cabdff1aSopenharmony_ci const int8_t *filter_y, 3831cabdff1aSopenharmony_ci int32_t height) 3832cabdff1aSopenharmony_ci{ 3833cabdff1aSopenharmony_ci uint32_t loop_cnt; 3834cabdff1aSopenharmony_ci uint8_t *src_tmp, *dst_tmp; 3835cabdff1aSopenharmony_ci v16u8 out0, out1; 3836cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 3837cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3838cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3; 3839cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3; 3840cabdff1aSopenharmony_ci v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6; 3841cabdff1aSopenharmony_ci v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 3842cabdff1aSopenharmony_ci v8i16 dst76_r, dst98_r, dst87_r, dst109_r; 3843cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 3844cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 3845cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 3846cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3847cabdff1aSopenharmony_ci 3848cabdff1aSopenharmony_ci src -= (src_stride + 1); 3849cabdff1aSopenharmony_ci 3850cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 3851cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3852cabdff1aSopenharmony_ci 3853cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 3854cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 3855cabdff1aSopenharmony_ci 3856cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3857cabdff1aSopenharmony_ci 3858cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 3859cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3860cabdff1aSopenharmony_ci 3861cabdff1aSopenharmony_ci src_tmp = src; 
3862cabdff1aSopenharmony_ci dst_tmp = dst; 3863cabdff1aSopenharmony_ci 3864cabdff1aSopenharmony_ci LD_SB3(src_tmp, src_stride, src0, src1, src2); 3865cabdff1aSopenharmony_ci src_tmp += (3 * src_stride); 3866cabdff1aSopenharmony_ci 3867cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3868cabdff1aSopenharmony_ci 3869cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3870cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 3871cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3872cabdff1aSopenharmony_ci 3873cabdff1aSopenharmony_ci dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3874cabdff1aSopenharmony_ci dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3875cabdff1aSopenharmony_ci dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3876cabdff1aSopenharmony_ci 3877cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 3878cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 3879cabdff1aSopenharmony_ci 3880cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 3881cabdff1aSopenharmony_ci LD_SB4(src_tmp, src_stride, src3, src4, src5, src6); 3882cabdff1aSopenharmony_ci src_tmp += (4 * src_stride); 3883cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 3884cabdff1aSopenharmony_ci 3885cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3886cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 3887cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 3888cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 3889cabdff1aSopenharmony_ci 3890cabdff1aSopenharmony_ci dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3891cabdff1aSopenharmony_ci dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3892cabdff1aSopenharmony_ci dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 
3893cabdff1aSopenharmony_ci dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3894cabdff1aSopenharmony_ci 3895cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 3896cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 3897cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 3898cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 3899cabdff1aSopenharmony_ci 3900cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3901cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 3902cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3903cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 3904cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 3905cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 3906cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 3907cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 3908cabdff1aSopenharmony_ci 3909cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 3910cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 3911cabdff1aSopenharmony_ci 3912cabdff1aSopenharmony_ci PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 3913cabdff1aSopenharmony_ci dst3_r, tmp0, tmp1, tmp2, tmp3); 3914cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6); 3915cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 3916cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 3917cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 3918cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); 3919cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 3920cabdff1aSopenharmony_ci 3921cabdff1aSopenharmony_ci dst10_r = dst54_r; 
3922cabdff1aSopenharmony_ci dst10_l = dst54_l; 3923cabdff1aSopenharmony_ci dst21_r = dst65_r; 3924cabdff1aSopenharmony_ci dst21_l = dst65_l; 3925cabdff1aSopenharmony_ci dsth2 = dsth6; 3926cabdff1aSopenharmony_ci } 3927cabdff1aSopenharmony_ci 3928cabdff1aSopenharmony_ci src += 8; 3929cabdff1aSopenharmony_ci dst += 8; 3930cabdff1aSopenharmony_ci 3931cabdff1aSopenharmony_ci mask2 = LD_SB(ff_hevc_mask_arr + 16); 3932cabdff1aSopenharmony_ci mask3 = mask2 + 2; 3933cabdff1aSopenharmony_ci 3934cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3935cabdff1aSopenharmony_ci src += (3 * src_stride); 3936cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3937cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 3938cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3); 3939cabdff1aSopenharmony_ci 3940cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3941cabdff1aSopenharmony_ci dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3942cabdff1aSopenharmony_ci 3943cabdff1aSopenharmony_ci ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 3944cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 3945cabdff1aSopenharmony_ci 3946cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 3947cabdff1aSopenharmony_ci LD_SB8(src, src_stride, 3948cabdff1aSopenharmony_ci src3, src4, src5, src6, src7, src8, src9, src10); 3949cabdff1aSopenharmony_ci src += (8 * src_stride); 3950cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3951cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1); 3952cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3); 3953cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5); 3954cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7); 3955cabdff1aSopenharmony_ci 
3956cabdff1aSopenharmony_ci dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3957cabdff1aSopenharmony_ci dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3958cabdff1aSopenharmony_ci dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3959cabdff1aSopenharmony_ci dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3960cabdff1aSopenharmony_ci 3961cabdff1aSopenharmony_ci dst32_r = __msa_ilvr_h(dst73, dst22); 3962cabdff1aSopenharmony_ci ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 3963cabdff1aSopenharmony_ci ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 3964cabdff1aSopenharmony_ci ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 3965cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 3966cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst22, dst106); 3967cabdff1aSopenharmony_ci 3968cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3969cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3970cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 3971cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 3972cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 3973cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 3974cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 3975cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 3976cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 3977cabdff1aSopenharmony_ci SRA_4V(dst4, dst5, dst6, dst7, 6); 3978cabdff1aSopenharmony_ci PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 3979cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 3980cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6); 3981cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 3982cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 
3983cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 3984cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 3985cabdff1aSopenharmony_ci dst += (8 * dst_stride); 3986cabdff1aSopenharmony_ci 3987cabdff1aSopenharmony_ci dst10_r = dst98_r; 3988cabdff1aSopenharmony_ci dst21_r = dst109_r; 3989cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 3990cabdff1aSopenharmony_ci } 3991cabdff1aSopenharmony_ci} 3992cabdff1aSopenharmony_ci 3993cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_16w_msa(uint8_t *src, 3994cabdff1aSopenharmony_ci int32_t src_stride, 3995cabdff1aSopenharmony_ci uint8_t *dst, 3996cabdff1aSopenharmony_ci int32_t dst_stride, 3997cabdff1aSopenharmony_ci const int8_t *filter_x, 3998cabdff1aSopenharmony_ci const int8_t *filter_y, 3999cabdff1aSopenharmony_ci int32_t height) 4000cabdff1aSopenharmony_ci{ 4001cabdff1aSopenharmony_ci if (4 == height) { 4002cabdff1aSopenharmony_ci hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x, 4003cabdff1aSopenharmony_ci filter_y, 2); 4004cabdff1aSopenharmony_ci } else { 4005cabdff1aSopenharmony_ci hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 4006cabdff1aSopenharmony_ci filter_x, filter_y, height, 2); 4007cabdff1aSopenharmony_ci } 4008cabdff1aSopenharmony_ci} 4009cabdff1aSopenharmony_ci 4010cabdff1aSopenharmony_cistatic void hevc_hv_uni_4t_24w_msa(uint8_t *src, 4011cabdff1aSopenharmony_ci int32_t src_stride, 4012cabdff1aSopenharmony_ci uint8_t *dst, 4013cabdff1aSopenharmony_ci int32_t dst_stride, 4014cabdff1aSopenharmony_ci const int8_t *filter_x, 4015cabdff1aSopenharmony_ci const int8_t *filter_y, 4016cabdff1aSopenharmony_ci int32_t height) 4017cabdff1aSopenharmony_ci{ 4018cabdff1aSopenharmony_ci hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 4019cabdff1aSopenharmony_ci filter_x, filter_y, height, 3); 4020cabdff1aSopenharmony_ci} 4021cabdff1aSopenharmony_ci 4022cabdff1aSopenharmony_cistatic void 
hevc_hv_uni_4t_32w_msa(uint8_t *src, 4023cabdff1aSopenharmony_ci int32_t src_stride, 4024cabdff1aSopenharmony_ci uint8_t *dst, 4025cabdff1aSopenharmony_ci int32_t dst_stride, 4026cabdff1aSopenharmony_ci const int8_t *filter_x, 4027cabdff1aSopenharmony_ci const int8_t *filter_y, 4028cabdff1aSopenharmony_ci int32_t height) 4029cabdff1aSopenharmony_ci{ 4030cabdff1aSopenharmony_ci hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 4031cabdff1aSopenharmony_ci filter_x, filter_y, height, 4); 4032cabdff1aSopenharmony_ci} 4033cabdff1aSopenharmony_ci 4034cabdff1aSopenharmony_ci#define UNI_MC_COPY(WIDTH) \ 4035cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \ 4036cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 4037cabdff1aSopenharmony_ci uint8_t *src, \ 4038cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 4039cabdff1aSopenharmony_ci int height, \ 4040cabdff1aSopenharmony_ci intptr_t mx, \ 4041cabdff1aSopenharmony_ci intptr_t my, \ 4042cabdff1aSopenharmony_ci int width) \ 4043cabdff1aSopenharmony_ci{ \ 4044cabdff1aSopenharmony_ci copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \ 4045cabdff1aSopenharmony_ci} 4046cabdff1aSopenharmony_ci 4047cabdff1aSopenharmony_ciUNI_MC_COPY(8); 4048cabdff1aSopenharmony_ciUNI_MC_COPY(12); 4049cabdff1aSopenharmony_ciUNI_MC_COPY(16); 4050cabdff1aSopenharmony_ciUNI_MC_COPY(24); 4051cabdff1aSopenharmony_ciUNI_MC_COPY(32); 4052cabdff1aSopenharmony_ciUNI_MC_COPY(48); 4053cabdff1aSopenharmony_ciUNI_MC_COPY(64); 4054cabdff1aSopenharmony_ci 4055cabdff1aSopenharmony_ci#undef UNI_MC_COPY 4056cabdff1aSopenharmony_ci 4057cabdff1aSopenharmony_ci#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 4058cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ 4059cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 4060cabdff1aSopenharmony_ci uint8_t *src, \ 4061cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 4062cabdff1aSopenharmony_ci int height, \ 
4063cabdff1aSopenharmony_ci intptr_t mx, \ 4064cabdff1aSopenharmony_ci intptr_t my, \ 4065cabdff1aSopenharmony_ci int width) \ 4066cabdff1aSopenharmony_ci{ \ 4067cabdff1aSopenharmony_ci const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 4068cabdff1aSopenharmony_ci \ 4069cabdff1aSopenharmony_ci common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \ 4070cabdff1aSopenharmony_ci filter, height); \ 4071cabdff1aSopenharmony_ci} 4072cabdff1aSopenharmony_ci 4073cabdff1aSopenharmony_ciUNI_MC(qpel, h, 4, 8, hz, mx); 4074cabdff1aSopenharmony_ciUNI_MC(qpel, h, 8, 8, hz, mx); 4075cabdff1aSopenharmony_ciUNI_MC(qpel, h, 12, 8, hz, mx); 4076cabdff1aSopenharmony_ciUNI_MC(qpel, h, 16, 8, hz, mx); 4077cabdff1aSopenharmony_ciUNI_MC(qpel, h, 24, 8, hz, mx); 4078cabdff1aSopenharmony_ciUNI_MC(qpel, h, 32, 8, hz, mx); 4079cabdff1aSopenharmony_ciUNI_MC(qpel, h, 48, 8, hz, mx); 4080cabdff1aSopenharmony_ciUNI_MC(qpel, h, 64, 8, hz, mx); 4081cabdff1aSopenharmony_ci 4082cabdff1aSopenharmony_ciUNI_MC(qpel, v, 4, 8, vt, my); 4083cabdff1aSopenharmony_ciUNI_MC(qpel, v, 8, 8, vt, my); 4084cabdff1aSopenharmony_ciUNI_MC(qpel, v, 12, 8, vt, my); 4085cabdff1aSopenharmony_ciUNI_MC(qpel, v, 16, 8, vt, my); 4086cabdff1aSopenharmony_ciUNI_MC(qpel, v, 24, 8, vt, my); 4087cabdff1aSopenharmony_ciUNI_MC(qpel, v, 32, 8, vt, my); 4088cabdff1aSopenharmony_ciUNI_MC(qpel, v, 48, 8, vt, my); 4089cabdff1aSopenharmony_ciUNI_MC(qpel, v, 64, 8, vt, my); 4090cabdff1aSopenharmony_ci 4091cabdff1aSopenharmony_ciUNI_MC(epel, h, 4, 4, hz, mx); 4092cabdff1aSopenharmony_ciUNI_MC(epel, h, 6, 4, hz, mx); 4093cabdff1aSopenharmony_ciUNI_MC(epel, h, 8, 4, hz, mx); 4094cabdff1aSopenharmony_ciUNI_MC(epel, h, 12, 4, hz, mx); 4095cabdff1aSopenharmony_ciUNI_MC(epel, h, 16, 4, hz, mx); 4096cabdff1aSopenharmony_ciUNI_MC(epel, h, 24, 4, hz, mx); 4097cabdff1aSopenharmony_ciUNI_MC(epel, h, 32, 4, hz, mx); 4098cabdff1aSopenharmony_ci 4099cabdff1aSopenharmony_ciUNI_MC(epel, v, 4, 4, vt, my); 
4100cabdff1aSopenharmony_ciUNI_MC(epel, v, 6, 4, vt, my); 4101cabdff1aSopenharmony_ciUNI_MC(epel, v, 8, 4, vt, my); 4102cabdff1aSopenharmony_ciUNI_MC(epel, v, 12, 4, vt, my); 4103cabdff1aSopenharmony_ciUNI_MC(epel, v, 16, 4, vt, my); 4104cabdff1aSopenharmony_ciUNI_MC(epel, v, 24, 4, vt, my); 4105cabdff1aSopenharmony_ciUNI_MC(epel, v, 32, 4, vt, my); 4106cabdff1aSopenharmony_ci 4107cabdff1aSopenharmony_ci#undef UNI_MC 4108cabdff1aSopenharmony_ci 4109cabdff1aSopenharmony_ci#define UNI_MC_HV(PEL, WIDTH, TAP) \ 4110cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \ 4111cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 4112cabdff1aSopenharmony_ci uint8_t *src, \ 4113cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 4114cabdff1aSopenharmony_ci int height, \ 4115cabdff1aSopenharmony_ci intptr_t mx, \ 4116cabdff1aSopenharmony_ci intptr_t my, \ 4117cabdff1aSopenharmony_ci int width) \ 4118cabdff1aSopenharmony_ci{ \ 4119cabdff1aSopenharmony_ci const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 4120cabdff1aSopenharmony_ci const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 4121cabdff1aSopenharmony_ci \ 4122cabdff1aSopenharmony_ci hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \ 4123cabdff1aSopenharmony_ci filter_x, filter_y, height); \ 4124cabdff1aSopenharmony_ci} 4125cabdff1aSopenharmony_ci 4126cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 4, 8); 4127cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 8, 8); 4128cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 12, 8); 4129cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 16, 8); 4130cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 24, 8); 4131cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 32, 8); 4132cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 48, 8); 4133cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 64, 8); 4134cabdff1aSopenharmony_ci 4135cabdff1aSopenharmony_ciUNI_MC_HV(epel, 4, 4); 4136cabdff1aSopenharmony_ciUNI_MC_HV(epel, 6, 4); 4137cabdff1aSopenharmony_ciUNI_MC_HV(epel, 8, 4); 
4138cabdff1aSopenharmony_ciUNI_MC_HV(epel, 12, 4); 4139cabdff1aSopenharmony_ciUNI_MC_HV(epel, 16, 4); 4140cabdff1aSopenharmony_ciUNI_MC_HV(epel, 24, 4); 4141cabdff1aSopenharmony_ciUNI_MC_HV(epel, 32, 4); 4142cabdff1aSopenharmony_ci 4143cabdff1aSopenharmony_ci#undef UNI_MC_HV 4144