1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2022 Loongson Technology Corporation Limited 3cabdff1aSopenharmony_ci * Contributed by Lu Wang <wanglu@loongson.cn> 4cabdff1aSopenharmony_ci * Hao Chen <chenhao@loongson.cn> 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * This file is part of FFmpeg. 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 12cabdff1aSopenharmony_ci * 13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16cabdff1aSopenharmony_ci * Lesser General Public License for more details. 17cabdff1aSopenharmony_ci * 18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21cabdff1aSopenharmony_ci */ 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 24cabdff1aSopenharmony_ci#include "hevcdsp_lsx.h" 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_cistatic const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = { 27cabdff1aSopenharmony_ci /* 8 width cases */ 28cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 29cabdff1aSopenharmony_ci /* 4 width cases */ 30cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 31cabdff1aSopenharmony_ci /* 4 width cases */ 32cabdff1aSopenharmony_ci 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 33cabdff1aSopenharmony_ci}; 34cabdff1aSopenharmony_ci 35cabdff1aSopenharmony_cistatic av_always_inline 36cabdff1aSopenharmony_civoid common_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride, 37cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 38cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 39cabdff1aSopenharmony_ci{ 40cabdff1aSopenharmony_ci int32_t loop_cnt; 41cabdff1aSopenharmony_ci __m128i mask0, mask1, mask2, mask3, out1, out2; 42cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 43cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 44cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 45cabdff1aSopenharmony_ci __m128i res0, res1, res2, res3; 46cabdff1aSopenharmony_ci 47cabdff1aSopenharmony_ci mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 48cabdff1aSopenharmony_ci src -= 3; 49cabdff1aSopenharmony_ci 50cabdff1aSopenharmony_ci /* rearranging filter */ 51cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 52cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 53cabdff1aSopenharmony_ci 54cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 55cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 56cabdff1aSopenharmony_ci 57cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 58cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, src, 0, src, 8, src, 16, src, 24, 59cabdff1aSopenharmony_ci src0, src1, src2, src3); 60cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, src, 32, src, 40, src, 48, src, 56, 61cabdff1aSopenharmony_ci src4, src5, src6, src7); 62cabdff1aSopenharmony_ci src += src_stride; 63cabdff1aSopenharmony_ci 64cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, 65cabdff1aSopenharmony_ci vec0, vec1); 66cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, 67cabdff1aSopenharmony_ci vec2, vec3); 68cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 69cabdff1aSopenharmony_ci vec3, filt0, res0, res1, res2, res3); 70cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, 71cabdff1aSopenharmony_ci vec0, vec1); 72cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, 73cabdff1aSopenharmony_ci vec2, vec3); 74cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2, 75cabdff1aSopenharmony_ci res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3); 76cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, 77cabdff1aSopenharmony_ci vec4, vec5); 78cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, 79cabdff1aSopenharmony_ci vec6, vec7); 80cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1, 81cabdff1aSopenharmony_ci res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3); 82cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, 83cabdff1aSopenharmony_ci vec4, vec5); 84cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3, 85cabdff1aSopenharmony_ci vec6, vec7); 86cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3, 87cabdff1aSopenharmony_ci res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3); 88cabdff1aSopenharmony_ci 89cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6, 90cabdff1aSopenharmony_ci out1, out2); 91cabdff1aSopenharmony_ci __lsx_vst(out1, dst, 0); 92cabdff1aSopenharmony_ci __lsx_vst(out2, dst, 16); 93cabdff1aSopenharmony_ci 94cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src5, mask0, 95cabdff1aSopenharmony_ci vec0, vec1); 96cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src7, src7, mask0, 97cabdff1aSopenharmony_ci vec2, vec3); 98cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 99cabdff1aSopenharmony_ci vec3, filt0, res0, res1, res2, res3); 100cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask2, src5, src5, mask2, 101cabdff1aSopenharmony_ci vec0, vec1); 102cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask2, src7, src7, mask2, 103cabdff1aSopenharmony_ci vec2, vec3); 104cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2, 105cabdff1aSopenharmony_ci res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3); 106cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src5, mask1, 107cabdff1aSopenharmony_ci vec4, vec5); 108cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask1, src7, src7, mask1, 109cabdff1aSopenharmony_ci vec6, vec7); 110cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1, 111cabdff1aSopenharmony_ci res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3); 112cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask3, src5, src5, mask3, 113cabdff1aSopenharmony_ci vec4, vec5); 114cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask3, src7, src7, mask3, 115cabdff1aSopenharmony_ci vec6, vec7); 116cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3, 117cabdff1aSopenharmony_ci res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3); 118cabdff1aSopenharmony_ci 119cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6, 120cabdff1aSopenharmony_ci out1, out2); 121cabdff1aSopenharmony_ci __lsx_vst(out1, dst, 32); 122cabdff1aSopenharmony_ci __lsx_vst(out2, dst, 48); 123cabdff1aSopenharmony_ci dst += dst_stride; 124cabdff1aSopenharmony_ci } 125cabdff1aSopenharmony_ci} 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_cistatic av_always_inline 128cabdff1aSopenharmony_civoid common_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride, 129cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 130cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 131cabdff1aSopenharmony_ci{ 132cabdff1aSopenharmony_ci uint32_t loop_cnt; 133cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 134cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 135cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 136cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 137cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 138cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 139cabdff1aSopenharmony_ci 140cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 141cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 142cabdff1aSopenharmony_ci __m128i src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; 143cabdff1aSopenharmony_ci __m128i tmp0, tmp1; 144cabdff1aSopenharmony_ci __m128i out0_r, out1_r, out2_r, out3_r; 145cabdff1aSopenharmony_ci 146cabdff1aSopenharmony_ci src -= src_stride_3x; 147cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 148cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 149cabdff1aSopenharmony_ci 150cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 151cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 152cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 153cabdff1aSopenharmony_ci src += src_stride_4x; 154cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 155cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 156cabdff1aSopenharmony_ci src += src_stride_3x; 157cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, 158cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 159cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r); 160cabdff1aSopenharmony_ci 161cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 162cabdff1aSopenharmony_ci src7 = __lsx_vld(src, 0); 163cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9); 164cabdff1aSopenharmony_ci src10 = __lsx_vldx(src, src_stride_3x); 165cabdff1aSopenharmony_ci src += src_stride_4x; 166cabdff1aSopenharmony_ci 167cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, 168cabdff1aSopenharmony_ci src9, src76_r, src87_r, src98_r, src109_r); 169cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r, 170cabdff1aSopenharmony_ci filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r); 171cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r, 172cabdff1aSopenharmony_ci src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r, 173cabdff1aSopenharmony_ci filt1, out0_r, out1_r, out2_r, out3_r); 174cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r, 175cabdff1aSopenharmony_ci src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r, 176cabdff1aSopenharmony_ci filt2, out0_r, out1_r, out2_r, out3_r); 177cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r, 178cabdff1aSopenharmony_ci src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r, 179cabdff1aSopenharmony_ci filt3, out0_r, out1_r, out2_r, out3_r); 180cabdff1aSopenharmony_ci 181cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6, 182cabdff1aSopenharmony_ci tmp0, tmp1) 183cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp0, dst, 0, 0); 184cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); 185cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp1, dst + dst_stride_2x, 0, 0); 186cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp1, dst + dst_stride_3x, 0, 1); 187cabdff1aSopenharmony_ci dst += dst_stride_4x; 188cabdff1aSopenharmony_ci 189cabdff1aSopenharmony_ci src10_r = src54_r; 190cabdff1aSopenharmony_ci src32_r = src76_r; 191cabdff1aSopenharmony_ci src54_r = src98_r; 192cabdff1aSopenharmony_ci src21_r = src65_r; 193cabdff1aSopenharmony_ci src43_r = src87_r; 194cabdff1aSopenharmony_ci src65_r = src109_r; 195cabdff1aSopenharmony_ci src6 = src10; 196cabdff1aSopenharmony_ci } 197cabdff1aSopenharmony_ci} 198cabdff1aSopenharmony_ci 199cabdff1aSopenharmony_cistatic av_always_inline 200cabdff1aSopenharmony_civoid common_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 201cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter, 202cabdff1aSopenharmony_ci int32_t height, int32_t width) 203cabdff1aSopenharmony_ci{ 204cabdff1aSopenharmony_ci uint8_t *src_tmp; 205cabdff1aSopenharmony_ci uint8_t *dst_tmp; 206cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 207cabdff1aSopenharmony_ci const int32_t src_stride_2x = (src_stride << 1); 208cabdff1aSopenharmony_ci const int32_t dst_stride_2x = (dst_stride << 1); 209cabdff1aSopenharmony_ci const int32_t src_stride_4x = (src_stride << 2); 210cabdff1aSopenharmony_ci const int32_t dst_stride_4x = (dst_stride << 2); 211cabdff1aSopenharmony_ci const int32_t src_stride_3x = src_stride_2x + src_stride; 212cabdff1aSopenharmony_ci const int32_t dst_stride_3x = dst_stride_2x + dst_stride; 213cabdff1aSopenharmony_ci 214cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 215cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 216cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 217cabdff1aSopenharmony_ci __m128i src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; 218cabdff1aSopenharmony_ci __m128i src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; 219cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3; 220cabdff1aSopenharmony_ci __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 221cabdff1aSopenharmony_ci 222cabdff1aSopenharmony_ci src -= src_stride_3x; 223cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, filt0, 224cabdff1aSopenharmony_ci filt1, filt2, filt3); 225cabdff1aSopenharmony_ci 226cabdff1aSopenharmony_ci for (cnt = (width >> 4); cnt--;) { 227cabdff1aSopenharmony_ci src_tmp = src; 228cabdff1aSopenharmony_ci dst_tmp = dst; 229cabdff1aSopenharmony_ci 230cabdff1aSopenharmony_ci src0 = __lsx_vld(src_tmp, 0); 231cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 232cabdff1aSopenharmony_ci src1, src2); 233cabdff1aSopenharmony_ci src3 = __lsx_vldx(src_tmp, src_stride_3x); 234cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 235cabdff1aSopenharmony_ci src4 = __lsx_vld(src_tmp, 0); 236cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 237cabdff1aSopenharmony_ci src5, src6); 238cabdff1aSopenharmony_ci src_tmp += src_stride_3x; 239cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, 240cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 241cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r); 242cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, 243cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 244cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l); 245cabdff1aSopenharmony_ci 246cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 247cabdff1aSopenharmony_ci src7 = __lsx_vld(src_tmp, 0); 248cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 249cabdff1aSopenharmony_ci src8, src9); 250cabdff1aSopenharmony_ci src10 = __lsx_vldx(src_tmp, src_stride_3x); 251cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 252cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, 253cabdff1aSopenharmony_ci src9, src76_r, src87_r, src98_r, src109_r); 254cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, 255cabdff1aSopenharmony_ci src9, src76_l, src87_l, src98_l, src109_l); 256cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r, 257cabdff1aSopenharmony_ci filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r); 258cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r, 259cabdff1aSopenharmony_ci src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r, 260cabdff1aSopenharmony_ci filt1, out0_r, out1_r, out2_r, out3_r); 261cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r, 262cabdff1aSopenharmony_ci src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r, 263cabdff1aSopenharmony_ci filt2, out0_r, out1_r, out2_r, out3_r); 264cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r, 265cabdff1aSopenharmony_ci src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r, 266cabdff1aSopenharmony_ci filt3, out0_r, out1_r, out2_r, out3_r); 267cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_l, filt0, src21_l, filt0, src32_l, 268cabdff1aSopenharmony_ci filt0, src43_l, filt0, out0_l, out1_l, out2_l, out3_l); 269cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src32_l, filt1, out1_l, 270cabdff1aSopenharmony_ci src43_l, filt1, out2_l, src54_l, filt1, out3_l, src65_l, 271cabdff1aSopenharmony_ci filt1, out0_l, out1_l, out2_l, out3_l); 272cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src54_l, filt2, out1_l, 273cabdff1aSopenharmony_ci src65_l, filt2, out2_l, src76_l, filt2, out3_l, src87_l, 274cabdff1aSopenharmony_ci filt2, out0_l, out1_l, out2_l, out3_l); 275cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src76_l, filt3, out1_l, 276cabdff1aSopenharmony_ci src87_l, filt3, out2_l, src98_l, filt3, out3_l, src109_l, 277cabdff1aSopenharmony_ci filt3, out0_l, out1_l, out2_l, out3_l); 278cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 279cabdff1aSopenharmony_ci 6, out2_l, out2_r, 6, out3_l, out3_r, 6, 280cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 281cabdff1aSopenharmony_ci __lsx_vst(tmp0, dst_tmp, 0); 282cabdff1aSopenharmony_ci __lsx_vstx(tmp1, dst_tmp, dst_stride); 283cabdff1aSopenharmony_ci __lsx_vstx(tmp2, dst_tmp, dst_stride_2x); 284cabdff1aSopenharmony_ci __lsx_vstx(tmp3, dst_tmp, dst_stride_3x); 285cabdff1aSopenharmony_ci dst_tmp += dst_stride_4x; 286cabdff1aSopenharmony_ci 287cabdff1aSopenharmony_ci src10_r = src54_r; 288cabdff1aSopenharmony_ci src32_r = src76_r; 289cabdff1aSopenharmony_ci src54_r = src98_r; 290cabdff1aSopenharmony_ci src21_r = src65_r; 291cabdff1aSopenharmony_ci src43_r = src87_r; 292cabdff1aSopenharmony_ci src65_r = src109_r; 293cabdff1aSopenharmony_ci src10_l = src54_l; 294cabdff1aSopenharmony_ci src32_l = src76_l; 295cabdff1aSopenharmony_ci src54_l = src98_l; 296cabdff1aSopenharmony_ci src21_l = src65_l; 297cabdff1aSopenharmony_ci src43_l = src87_l; 298cabdff1aSopenharmony_ci src65_l = src109_l; 299cabdff1aSopenharmony_ci src6 = src10; 300cabdff1aSopenharmony_ci } 301cabdff1aSopenharmony_ci 302cabdff1aSopenharmony_ci src += 16; 303cabdff1aSopenharmony_ci dst += 16; 304cabdff1aSopenharmony_ci } 305cabdff1aSopenharmony_ci} 306cabdff1aSopenharmony_ci 307cabdff1aSopenharmony_cistatic void common_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride, 308cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 309cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 310cabdff1aSopenharmony_ci{ 311cabdff1aSopenharmony_ci common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 16); 312cabdff1aSopenharmony_ci common_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride, filter, 313cabdff1aSopenharmony_ci height); 314cabdff1aSopenharmony_ci} 315cabdff1aSopenharmony_ci 316cabdff1aSopenharmony_cistatic void common_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride, 317cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 318cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 319cabdff1aSopenharmony_ci{ 320cabdff1aSopenharmony_ci common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 32); 321cabdff1aSopenharmony_ci} 322cabdff1aSopenharmony_ci 323cabdff1aSopenharmony_cistatic void common_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride, 324cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 325cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 326cabdff1aSopenharmony_ci{ 327cabdff1aSopenharmony_ci common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 48); 328cabdff1aSopenharmony_ci} 329cabdff1aSopenharmony_ci 330cabdff1aSopenharmony_cistatic void common_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride, 331cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 332cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 333cabdff1aSopenharmony_ci{ 334cabdff1aSopenharmony_ci common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 64); 335cabdff1aSopenharmony_ci} 336cabdff1aSopenharmony_ci 337cabdff1aSopenharmony_cistatic av_always_inline 338cabdff1aSopenharmony_civoid hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 339cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 340cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height, int32_t width) 341cabdff1aSopenharmony_ci{ 342cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 343cabdff1aSopenharmony_ci uint8_t *src_tmp; 344cabdff1aSopenharmony_ci uint8_t *dst_tmp; 345cabdff1aSopenharmony_ci const int32_t src_stride_2x = (src_stride << 1); 346cabdff1aSopenharmony_ci const int32_t dst_stride_2x = (dst_stride << 1); 347cabdff1aSopenharmony_ci const int32_t src_stride_4x = (src_stride << 2); 348cabdff1aSopenharmony_ci const int32_t src_stride_3x = src_stride_2x + src_stride; 349cabdff1aSopenharmony_ci 350cabdff1aSopenharmony_ci __m128i out; 351cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; 352cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 353cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1, filt_h2, filt_h3; 354cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 355cabdff1aSopenharmony_ci __m128i filter_vec; 356cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 357cabdff1aSopenharmony_ci __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 358cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 359cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l; 360cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst54_r, dst76_r; 361cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst54_l, dst76_l; 362cabdff1aSopenharmony_ci __m128i dst21_r, dst43_r, dst65_r, dst87_r; 363cabdff1aSopenharmony_ci __m128i dst21_l, dst43_l, dst65_l, dst87_l; 364cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 365cabdff1aSopenharmony_ci 366cabdff1aSopenharmony_ci src -= (src_stride_3x + 3); 367cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, 368cabdff1aSopenharmony_ci filter_x, 6, filt0, filt1, filt2, filt3); 369cabdff1aSopenharmony_ci 370cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 371cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 372cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2, 373cabdff1aSopenharmony_ci filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3); 374cabdff1aSopenharmony_ci 375cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 376cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 377cabdff1aSopenharmony_ci 378cabdff1aSopenharmony_ci for (cnt = width >> 3; cnt--;) { 379cabdff1aSopenharmony_ci src_tmp = src; 380cabdff1aSopenharmony_ci dst_tmp = dst; 381cabdff1aSopenharmony_ci 382cabdff1aSopenharmony_ci src0 = __lsx_vld(src_tmp, 0); 383cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 384cabdff1aSopenharmony_ci src1, src2); 385cabdff1aSopenharmony_ci src3 = __lsx_vldx(src_tmp, src_stride_3x); 386cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 387cabdff1aSopenharmony_ci src4 = __lsx_vld(src_tmp, 0); 388cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 389cabdff1aSopenharmony_ci src5, src6); 390cabdff1aSopenharmony_ci src_tmp += src_stride_3x; 391cabdff1aSopenharmony_ci 392cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 393cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, 394cabdff1aSopenharmony_ci src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3); 395cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, 396cabdff1aSopenharmony_ci src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7); 397cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, 398cabdff1aSopenharmony_ci src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11); 399cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, 400cabdff1aSopenharmony_ci src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15); 401cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0, 402cabdff1aSopenharmony_ci vec12, filt0, dst0, dst1, dst2, dst3); 403cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1, 404cabdff1aSopenharmony_ci dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3); 405cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2, 406cabdff1aSopenharmony_ci dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3); 407cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3, 408cabdff1aSopenharmony_ci dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3); 409cabdff1aSopenharmony_ci 410cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4, 411cabdff1aSopenharmony_ci src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3); 412cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5, 413cabdff1aSopenharmony_ci src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7); 414cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6, 415cabdff1aSopenharmony_ci src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11); 416cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5); 417cabdff1aSopenharmony_ci dst6 = __lsx_vdp2_h_bu_b(vec8, filt0); 418cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1, 419cabdff1aSopenharmony_ci dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4); 420cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2, 421cabdff1aSopenharmony_ci dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5); 422cabdff1aSopenharmony_ci dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3); 423cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2, 424cabdff1aSopenharmony_ci dst1, dst10_r, dst32_r, dst54_r, dst21_r); 425cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2, 426cabdff1aSopenharmony_ci dst1, dst10_l, dst32_l, dst54_l, dst21_l); 427cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r); 428cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l); 429cabdff1aSopenharmony_ci 430cabdff1aSopenharmony_ci for (loop_cnt = height >> 1; loop_cnt--;) { 431cabdff1aSopenharmony_ci src7 = __lsx_vld(src_tmp, 0); 432cabdff1aSopenharmony_ci src8 = __lsx_vldx(src_tmp, src_stride); 433cabdff1aSopenharmony_ci src_tmp += src_stride_2x; 434cabdff1aSopenharmony_ci 435cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7, 436cabdff1aSopenharmony_ci src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3); 437cabdff1aSopenharmony_ci dst7 = __lsx_vdp2_h_bu_b(vec0, filt0); 438cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, 439cabdff1aSopenharmony_ci filt2, dst7, dst7); 440cabdff1aSopenharmony_ci dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3); 441cabdff1aSopenharmony_ci dst76_r = __lsx_vilvl_h(dst7, dst6); 442cabdff1aSopenharmony_ci dst76_l = __lsx_vilvh_h(dst7, dst6); 443cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, 444cabdff1aSopenharmony_ci dst0_r, dst0_l); 445cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, 446cabdff1aSopenharmony_ci dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l, 447cabdff1aSopenharmony_ci dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l); 448cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l, 449cabdff1aSopenharmony_ci dst76_l, filt_h3, dst0_r, dst0_l); 450cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l); 451cabdff1aSopenharmony_ci 452cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8, 453cabdff1aSopenharmony_ci src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3); 454cabdff1aSopenharmony_ci dst8 = __lsx_vdp2_h_bu_b(vec0, filt0); 455cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2, 456cabdff1aSopenharmony_ci filt2, dst8, dst8); 457cabdff1aSopenharmony_ci dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3); 458cabdff1aSopenharmony_ci 459cabdff1aSopenharmony_ci dst87_r = __lsx_vilvl_h(dst8, dst7); 460cabdff1aSopenharmony_ci dst87_l = __lsx_vilvh_h(dst8, dst7); 461cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0, 462cabdff1aSopenharmony_ci dst1_r, dst1_l); 463cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l, 464cabdff1aSopenharmony_ci dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l, 465cabdff1aSopenharmony_ci dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l); 466cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l, 467cabdff1aSopenharmony_ci dst87_l, filt_h3, dst1_r, dst1_l); 468cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l); 469cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrari_w, dst0_r, 6, dst0_l, 6,dst1_r, 6, dst1_l, 470cabdff1aSopenharmony_ci 6, dst0_r, dst0_l, dst1_r, dst1_l); 471cabdff1aSopenharmony_ci DUP4_ARG1(__lsx_vclip255_w, dst0_l, dst0_r, dst1_l, dst1_r, 472cabdff1aSopenharmony_ci dst0_l, dst0_r, dst1_l, dst1_r); 473cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, 474cabdff1aSopenharmony_ci dst0, dst1); 475cabdff1aSopenharmony_ci out = __lsx_vpickev_b(dst1, dst0); 476cabdff1aSopenharmony_ci __lsx_vstelm_d(out, dst_tmp, 0, 0); 477cabdff1aSopenharmony_ci __lsx_vstelm_d(out, dst_tmp + dst_stride, 0, 1); 478cabdff1aSopenharmony_ci dst_tmp += dst_stride_2x; 479cabdff1aSopenharmony_ci 480cabdff1aSopenharmony_ci dst10_r = dst32_r; 481cabdff1aSopenharmony_ci dst32_r = dst54_r; 482cabdff1aSopenharmony_ci dst54_r = dst76_r; 483cabdff1aSopenharmony_ci dst10_l = dst32_l; 484cabdff1aSopenharmony_ci dst32_l = dst54_l; 485cabdff1aSopenharmony_ci dst54_l = dst76_l; 486cabdff1aSopenharmony_ci dst21_r = dst43_r; 487cabdff1aSopenharmony_ci dst43_r = dst65_r; 488cabdff1aSopenharmony_ci dst65_r = dst87_r; 489cabdff1aSopenharmony_ci dst21_l = dst43_l; 490cabdff1aSopenharmony_ci dst43_l = dst65_l; 491cabdff1aSopenharmony_ci dst65_l = dst87_l; 492cabdff1aSopenharmony_ci dst6 = dst8; 493cabdff1aSopenharmony_ci } 494cabdff1aSopenharmony_ci src += 8; 495cabdff1aSopenharmony_ci dst += 8; 496cabdff1aSopenharmony_ci } 497cabdff1aSopenharmony_ci} 498cabdff1aSopenharmony_ci 499cabdff1aSopenharmony_cistatic void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 500cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 501cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 502cabdff1aSopenharmony_ci{ 503cabdff1aSopenharmony_ci hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, 504cabdff1aSopenharmony_ci filter_x, filter_y, height, 8); 505cabdff1aSopenharmony_ci} 506cabdff1aSopenharmony_ci 507cabdff1aSopenharmony_cistatic void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 508cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 509cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 510cabdff1aSopenharmony_ci{ 511cabdff1aSopenharmony_ci hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, 512cabdff1aSopenharmony_ci filter_x, filter_y, height, 16); 513cabdff1aSopenharmony_ci} 514cabdff1aSopenharmony_ci 515cabdff1aSopenharmony_cistatic void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 516cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 517cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 518cabdff1aSopenharmony_ci{ 519cabdff1aSopenharmony_ci hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, 520cabdff1aSopenharmony_ci filter_x, filter_y, height, 24); 521cabdff1aSopenharmony_ci} 522cabdff1aSopenharmony_ci 523cabdff1aSopenharmony_cistatic void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 524cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 525cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 526cabdff1aSopenharmony_ci{ 527cabdff1aSopenharmony_ci hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, 528cabdff1aSopenharmony_ci filter_x, filter_y, height, 32); 529cabdff1aSopenharmony_ci} 530cabdff1aSopenharmony_ci 531cabdff1aSopenharmony_cistatic void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 532cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 533cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 534cabdff1aSopenharmony_ci{ 535cabdff1aSopenharmony_ci hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, 536cabdff1aSopenharmony_ci filter_x, filter_y, height, 48); 537cabdff1aSopenharmony_ci} 538cabdff1aSopenharmony_ci 539cabdff1aSopenharmony_cistatic void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 540cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 541cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 542cabdff1aSopenharmony_ci{ 543cabdff1aSopenharmony_ci hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, 544cabdff1aSopenharmony_ci filter_x, filter_y, height, 64); 545cabdff1aSopenharmony_ci} 546cabdff1aSopenharmony_ci 547cabdff1aSopenharmony_cistatic av_always_inline 548cabdff1aSopenharmony_civoid common_vt_4t_24w_lsx(uint8_t *src, int32_t src_stride, 549cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 550cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 551cabdff1aSopenharmony_ci{ 552cabdff1aSopenharmony_ci uint32_t loop_cnt; 553cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 554cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 555cabdff1aSopenharmony_ci uint8_t *_src; 556cabdff1aSopenharmony_ci 557cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 558cabdff1aSopenharmony_ci __m128i src11, filt0, filt1; 559cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 560cabdff1aSopenharmony_ci __m128i src109_r, src10_l, src32_l, src21_l, src43_l; 561cabdff1aSopenharmony_ci __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l; 562cabdff1aSopenharmony_ci __m128i out1, out2, out3, out4; 563cabdff1aSopenharmony_ci 564cabdff1aSopenharmony_ci src -= src_stride; 565cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 566cabdff1aSopenharmony_ci _src = src + 16; 567cabdff1aSopenharmony_ci 568cabdff1aSopenharmony_ci /* 16 width */ 569cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 570cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 571cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 572cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 573cabdff1aSopenharmony_ci 574cabdff1aSopenharmony_ci /* 8 width */ 575cabdff1aSopenharmony_ci src6 = __lsx_vld(_src, 0); 576cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8); 577cabdff1aSopenharmony_ci src += src_stride_3x; 578cabdff1aSopenharmony_ci _src += src_stride_3x; 579cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r); 580cabdff1aSopenharmony_ci 581cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 582cabdff1aSopenharmony_ci /* 16 width */ 583cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9); 584cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10); 585cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 586cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 587cabdff1aSopenharmony_ci 588cabdff1aSopenharmony_ci /* 8 width */ 589cabdff1aSopenharmony_ci src += src_stride_2x; 590cabdff1aSopenharmony_ci _src += src_stride_2x; 591cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r); 592cabdff1aSopenharmony_ci 593cabdff1aSopenharmony_ci /* 16 width */ 594cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r, 595cabdff1aSopenharmony_ci filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l); 596cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l, 597cabdff1aSopenharmony_ci filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1, 598cabdff1aSopenharmony_ci out0_r, out0_l, out1_r, out1_l); 599cabdff1aSopenharmony_ci 600cabdff1aSopenharmony_ci /* 8 width */ 601cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0, 602cabdff1aSopenharmony_ci out2_r, out3_r); 603cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out3_r, 604cabdff1aSopenharmony_ci src109_r, filt1, out2_r, out3_r); 605cabdff1aSopenharmony_ci 606cabdff1aSopenharmony_ci /* 16 + 8 width */ 607cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6, 608cabdff1aSopenharmony_ci out3_r, out3_r, 6, out1_l, out1_r, 6, out1, out2, out3, out4); 609cabdff1aSopenharmony_ci __lsx_vst(out1, dst, 0); 610cabdff1aSopenharmony_ci __lsx_vstelm_d(out2, dst, 16, 0); 611cabdff1aSopenharmony_ci dst += dst_stride; 612cabdff1aSopenharmony_ci __lsx_vst(out4, dst, 0); 613cabdff1aSopenharmony_ci __lsx_vstelm_d(out3, dst, 16, 0); 614cabdff1aSopenharmony_ci dst += dst_stride; 615cabdff1aSopenharmony_ci 616cabdff1aSopenharmony_ci /* 16 width */ 617cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11); 618cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8); 619cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r); 620cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l); 621cabdff1aSopenharmony_ci 622cabdff1aSopenharmony_ci /* 8 width */ 623cabdff1aSopenharmony_ci src += src_stride_2x; 624cabdff1aSopenharmony_ci _src += src_stride_2x; 625cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r); 626cabdff1aSopenharmony_ci 627cabdff1aSopenharmony_ci /* 16 width */ 628cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r, 629cabdff1aSopenharmony_ci filt0, src43_l, filt0, out0_r, out0_l, out1_r, out1_l); 630cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src10_r, filt1, out0_l, src10_l, 631cabdff1aSopenharmony_ci filt1, out1_r, src21_r, filt1, out1_l, src21_l, filt1, 632cabdff1aSopenharmony_ci out0_r, out0_l, out1_r, out1_l); 633cabdff1aSopenharmony_ci 634cabdff1aSopenharmony_ci /* 8 width */ 635cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0, 636cabdff1aSopenharmony_ci out2_r, out3_r); 637cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src76_r, filt1, out3_r, 638cabdff1aSopenharmony_ci src87_r, filt1, out2_r, out3_r); 639cabdff1aSopenharmony_ci 640cabdff1aSopenharmony_ci /* 16 + 8 width */ 641cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6, 642cabdff1aSopenharmony_ci out1_l, out1_r, 6, out3_r, out3_r, 6, out1, out2, out3, out4); 643cabdff1aSopenharmony_ci 644cabdff1aSopenharmony_ci __lsx_vst(out1, dst, 0); 645cabdff1aSopenharmony_ci __lsx_vstelm_d(out2, dst, 16, 0); 646cabdff1aSopenharmony_ci dst += dst_stride; 647cabdff1aSopenharmony_ci __lsx_vst(out3, dst, 0); 648cabdff1aSopenharmony_ci __lsx_vstelm_d(out4, dst, 16, 0); 649cabdff1aSopenharmony_ci dst += dst_stride; 650cabdff1aSopenharmony_ci } 651cabdff1aSopenharmony_ci} 652cabdff1aSopenharmony_ci 653cabdff1aSopenharmony_cistatic av_always_inline 654cabdff1aSopenharmony_civoid common_vt_4t_32w_lsx(uint8_t *src, int32_t src_stride, 655cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 656cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 657cabdff1aSopenharmony_ci{ 658cabdff1aSopenharmony_ci uint32_t loop_cnt; 659cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 660cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 661cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 662cabdff1aSopenharmony_ci uint8_t *_src; 663cabdff1aSopenharmony_ci 664cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src6, src7, src8, src9, src10; 665cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src76_r, src98_r; 666cabdff1aSopenharmony_ci __m128i src21_r, src43_r, src87_r, src109_r; 667cabdff1aSopenharmony_ci __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 668cabdff1aSopenharmony_ci __m128i src10_l, src32_l, src76_l, src98_l; 669cabdff1aSopenharmony_ci __m128i src21_l, src43_l, src87_l, src109_l; 670cabdff1aSopenharmony_ci __m128i filt0, filt1; 671cabdff1aSopenharmony_ci __m128i out1, out2; 672cabdff1aSopenharmony_ci 673cabdff1aSopenharmony_ci src -= src_stride; 674cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 675cabdff1aSopenharmony_ci _src = src + 16; 676cabdff1aSopenharmony_ci 677cabdff1aSopenharmony_ci /* 16 width */ 678cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 679cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 680cabdff1aSopenharmony_ci 681cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 682cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 683cabdff1aSopenharmony_ci 684cabdff1aSopenharmony_ci /* next 16 width */ 685cabdff1aSopenharmony_ci src6 = __lsx_vld(_src, 0); 686cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8); 687cabdff1aSopenharmony_ci src += src_stride_3x; 688cabdff1aSopenharmony_ci _src += src_stride_3x; 689cabdff1aSopenharmony_ci 690cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r); 691cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l); 692cabdff1aSopenharmony_ci 693cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 694cabdff1aSopenharmony_ci /* 16 width */ 695cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9); 696cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10); 697cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 698cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 699cabdff1aSopenharmony_ci 700cabdff1aSopenharmony_ci /* 16 width */ 701cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r, 702cabdff1aSopenharmony_ci filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l); 703cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l, 704cabdff1aSopenharmony_ci filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1, 705cabdff1aSopenharmony_ci out0_r, out0_l, out1_r, out1_l); 706cabdff1aSopenharmony_ci 707cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 6, 708cabdff1aSopenharmony_ci out1, out2); 709cabdff1aSopenharmony_ci __lsx_vst(out1, dst, 0); 710cabdff1aSopenharmony_ci __lsx_vstx(out2, dst, dst_stride); 711cabdff1aSopenharmony_ci 712cabdff1aSopenharmony_ci src10_r = src32_r; 713cabdff1aSopenharmony_ci src21_r = src43_r; 714cabdff1aSopenharmony_ci src10_l = src32_l; 715cabdff1aSopenharmony_ci src21_l = src43_l; 716cabdff1aSopenharmony_ci src2 = src4; 717cabdff1aSopenharmony_ci 718cabdff1aSopenharmony_ci /* next 16 width */ 719cabdff1aSopenharmony_ci src += src_stride_2x; 720cabdff1aSopenharmony_ci _src += src_stride_2x; 721cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r); 722cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l); 723cabdff1aSopenharmony_ci 724cabdff1aSopenharmony_ci /* next 16 width */ 725cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r, 726cabdff1aSopenharmony_ci filt0, src87_l, filt0, out2_r, out2_l, out3_r, out3_l); 727cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out2_l, src98_l, 728cabdff1aSopenharmony_ci filt1, out3_r, src109_r, filt1, out3_l, src109_l, filt1, 729cabdff1aSopenharmony_ci out2_r, out2_l, out3_r, out3_l); 730cabdff1aSopenharmony_ci 731cabdff1aSopenharmony_ci /* next 16 width */ 732cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrarni_bu_h, out2_l, out2_r, 6, out3_l, out3_r, 6, 733cabdff1aSopenharmony_ci out1, out2); 734cabdff1aSopenharmony_ci __lsx_vst(out1, dst, 16); 735cabdff1aSopenharmony_ci __lsx_vst(out2, dst + dst_stride, 16); 736cabdff1aSopenharmony_ci 737cabdff1aSopenharmony_ci dst += dst_stride_2x; 738cabdff1aSopenharmony_ci 739cabdff1aSopenharmony_ci src76_r = src98_r; 740cabdff1aSopenharmony_ci src87_r = src109_r; 741cabdff1aSopenharmony_ci src76_l = src98_l; 742cabdff1aSopenharmony_ci src87_l = src109_l; 743cabdff1aSopenharmony_ci src8 = src10; 744cabdff1aSopenharmony_ci } 745cabdff1aSopenharmony_ci} 746cabdff1aSopenharmony_ci 747cabdff1aSopenharmony_cistatic av_always_inline 748cabdff1aSopenharmony_civoid hevc_hv_4t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 749cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 750cabdff1aSopenharmony_ci const int8_t *filter_y) 751cabdff1aSopenharmony_ci{ 752cabdff1aSopenharmony_ci const int32_t src_stride_2x = (src_stride << 1); 753cabdff1aSopenharmony_ci const int32_t src_stride_4x = (src_stride << 2); 754cabdff1aSopenharmony_ci const int32_t src_stride_3x = src_stride_2x + src_stride; 755cabdff1aSopenharmony_ci __m128i out; 756cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4; 757cabdff1aSopenharmony_ci __m128i filt0, filt1; 758cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1, filter_vec; 759cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 760cabdff1aSopenharmony_ci __m128i mask1; 761cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 762cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4; 763cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l; 764cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst21_r, dst43_r; 765cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst21_l, dst43_l; 766cabdff1aSopenharmony_ci __m128i out0_r, out1_r; 767cabdff1aSopenharmony_ci 768cabdff1aSopenharmony_ci src -= (src_stride + 1); 769cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 770cabdff1aSopenharmony_ci 771cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 772cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 773cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 774cabdff1aSopenharmony_ci 775cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 776cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 777cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src, 778cabdff1aSopenharmony_ci src_stride_3x, src, src_stride_4x, src1, src2, src3, src4); 779cabdff1aSopenharmony_ci 780cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1, 781cabdff1aSopenharmony_ci mask0, src1, src1, mask1, vec0, vec1, vec2, vec3); 782cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3, 783cabdff1aSopenharmony_ci mask0, src3, src3, mask1, vec4, vec5, vec6, vec7); 784cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9); 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6, 787cabdff1aSopenharmony_ci filt0, dst0, dst1, dst2, dst3); 788cabdff1aSopenharmony_ci dst4 = __lsx_vdp2_h_bu_b(vec8, filt0); 789cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2, 790cabdff1aSopenharmony_ci vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3); 791cabdff1aSopenharmony_ci dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1); 792cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3, 793cabdff1aSopenharmony_ci dst10_r, dst21_r, dst32_r, dst43_r); 794cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3, 795cabdff1aSopenharmony_ci dst10_l, dst21_l, dst32_l, dst43_l); 796cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 797cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 798cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l, 799cabdff1aSopenharmony_ci filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1, 800cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 801cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, 802cabdff1aSopenharmony_ci out0_r, out1_r); 803cabdff1aSopenharmony_ci out = __lsx_vssrarni_bu_h(out1_r, out0_r, 6); 804cabdff1aSopenharmony_ci __lsx_vstelm_d(out, dst, 0, 0); 805cabdff1aSopenharmony_ci __lsx_vstelm_d(out, dst + dst_stride, 0, 1); 806cabdff1aSopenharmony_ci} 807cabdff1aSopenharmony_ci 808cabdff1aSopenharmony_cistatic av_always_inline 809cabdff1aSopenharmony_civoid hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 810cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 811cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t width8mult) 812cabdff1aSopenharmony_ci{ 813cabdff1aSopenharmony_ci uint32_t cnt; 814cabdff1aSopenharmony_ci const int32_t src_stride_2x = (src_stride << 1); 815cabdff1aSopenharmony_ci const int32_t dst_stride_2x = (dst_stride << 1); 816cabdff1aSopenharmony_ci const int32_t src_stride_4x = (src_stride << 2); 817cabdff1aSopenharmony_ci const int32_t src_stride_3x = src_stride_2x + src_stride; 818cabdff1aSopenharmony_ci const int32_t dst_stride_3x = dst_stride_2x + dst_stride; 819cabdff1aSopenharmony_ci 820cabdff1aSopenharmony_ci __m128i out0, out1; 821cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1; 822cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 823cabdff1aSopenharmony_ci __m128i filt0, filt1, filt_h0, filt_h1, filter_vec; 824cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3; 825cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 826cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 827cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 828cabdff1aSopenharmony_ci 829cabdff1aSopenharmony_ci src -= (src_stride + 1); 830cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 831cabdff1aSopenharmony_ci 832cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 833cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 834cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 835cabdff1aSopenharmony_ci 836cabdff1aSopenharmony_ci mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 837cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 838cabdff1aSopenharmony_ci 839cabdff1aSopenharmony_ci for (cnt = width8mult; cnt--;) { 840cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 841cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 842cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 843cabdff1aSopenharmony_ci src += src_stride_4x; 844cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 845cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 846cabdff1aSopenharmony_ci src += (8 - src_stride_4x); 847cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, 848cabdff1aSopenharmony_ci vec0, vec1); 849cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, 850cabdff1aSopenharmony_ci vec2, vec3); 851cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, 852cabdff1aSopenharmony_ci vec4, vec5); 853cabdff1aSopenharmony_ci 854cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1); 855cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec4, filt0); 856cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, 857cabdff1aSopenharmony_ci dst0, dst1); 858cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1); 859cabdff1aSopenharmony_ci 860cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r); 861cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l); 862cabdff1aSopenharmony_ci 863cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, 864cabdff1aSopenharmony_ci vec0, vec1); 865cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, 866cabdff1aSopenharmony_ci vec2, vec3); 867cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, 868cabdff1aSopenharmony_ci vec4, vec5); 869cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, 870cabdff1aSopenharmony_ci vec6, vec7); 871cabdff1aSopenharmony_ci 872cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, 873cabdff1aSopenharmony_ci vec6, filt0, dst3, dst4, dst5, dst6); 874cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1, 875cabdff1aSopenharmony_ci dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6); 876cabdff1aSopenharmony_ci 877cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6, 878cabdff1aSopenharmony_ci dst5, dst32_r, dst43_r, dst54_r, dst65_r); 879cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6, 880cabdff1aSopenharmony_ci dst5, dst32_l, dst43_l, dst54_l, dst65_l); 881cabdff1aSopenharmony_ci 882cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 883cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 884cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r, 885cabdff1aSopenharmony_ci filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l); 886cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l, 887cabdff1aSopenharmony_ci filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1, 888cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 889cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l, 890cabdff1aSopenharmony_ci filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1, 891cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 892cabdff1aSopenharmony_ci 893cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, 894cabdff1aSopenharmony_ci dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3); 895cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1); 896cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst, 0, 0); 897cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); 898cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0); 899cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1); 900cabdff1aSopenharmony_ci dst += 8; 901cabdff1aSopenharmony_ci } 902cabdff1aSopenharmony_ci} 903cabdff1aSopenharmony_ci 904cabdff1aSopenharmony_cistatic av_always_inline 905cabdff1aSopenharmony_civoid hevc_hv_4t_8x6_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 906cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 907cabdff1aSopenharmony_ci const int8_t *filter_y) 908cabdff1aSopenharmony_ci{ 909cabdff1aSopenharmony_ci const int32_t src_stride_2x = (src_stride << 1); 910cabdff1aSopenharmony_ci const int32_t dst_stride_2x = (dst_stride << 1); 911cabdff1aSopenharmony_ci const int32_t src_stride_4x = (src_stride << 2); 912cabdff1aSopenharmony_ci const int32_t dst_stride_4x = (dst_stride << 2); 913cabdff1aSopenharmony_ci const int32_t src_stride_3x = src_stride_2x + src_stride; 914cabdff1aSopenharmony_ci const int32_t dst_stride_3x = dst_stride_2x + dst_stride; 915cabdff1aSopenharmony_ci __m128i out0, out1, out2; 916cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; 917cabdff1aSopenharmony_ci __m128i filt0, filt1; 918cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1, filter_vec; 919cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 920cabdff1aSopenharmony_ci __m128i mask1; 921cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 922cabdff1aSopenharmony_ci __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; 923cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 924cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 925cabdff1aSopenharmony_ci __m128i dst4_r, dst4_l, dst5_r, dst5_l; 926cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst10_l, dst32_l; 927cabdff1aSopenharmony_ci __m128i dst21_r, dst43_r, dst21_l, dst43_l; 928cabdff1aSopenharmony_ci __m128i dst54_r, dst54_l, dst65_r, dst65_l; 929cabdff1aSopenharmony_ci __m128i dst76_r, dst76_l, dst87_r, dst87_l; 930cabdff1aSopenharmony_ci __m128i out0_r, out1_r, out2_r, out3_r, out4_r, out5_r; 931cabdff1aSopenharmony_ci 932cabdff1aSopenharmony_ci src -= (src_stride + 1); 933cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 934cabdff1aSopenharmony_ci 935cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 936cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 937cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 938cabdff1aSopenharmony_ci 939cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 940cabdff1aSopenharmony_ci 941cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 942cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,src, 943cabdff1aSopenharmony_ci src_stride_3x, src, src_stride_4x, src1, src2, src3, src4); 944cabdff1aSopenharmony_ci src += src_stride_4x; 945cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,src, 946cabdff1aSopenharmony_ci src_stride_3x, src, src_stride_4x, src5, src6, src7, src8); 947cabdff1aSopenharmony_ci 948cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1, 949cabdff1aSopenharmony_ci mask0, src1, src1, mask1, vec0, vec1, vec2, vec3); 950cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3, 951cabdff1aSopenharmony_ci mask0, src3, src3, mask1, vec4, vec5, vec6, vec7); 952cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5, 953cabdff1aSopenharmony_ci mask0, src5, src5, mask1, vec8, vec9, vec10, vec11); 954cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7, 955cabdff1aSopenharmony_ci mask0, src7, src7, mask1, vec12, vec13, vec14, vec15); 956cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17); 957cabdff1aSopenharmony_ci 958cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6, 959cabdff1aSopenharmony_ci filt0, dst0, dst1, dst2, dst3); 960cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, vec14, 961cabdff1aSopenharmony_ci filt0, dst4, dst5, dst6, dst7); 962cabdff1aSopenharmony_ci dst8 = __lsx_vdp2_h_bu_b(vec16, filt0); 963cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2, 964cabdff1aSopenharmony_ci vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3); 965cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6, 966cabdff1aSopenharmony_ci vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7); 967cabdff1aSopenharmony_ci dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1); 968cabdff1aSopenharmony_ci 969cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3, 970cabdff1aSopenharmony_ci dst10_r, dst21_r, dst32_r, dst43_r); 971cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3, 972cabdff1aSopenharmony_ci dst10_l, dst21_l, dst32_l, dst43_l); 973cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7, 974cabdff1aSopenharmony_ci dst54_r, dst65_r, dst76_r, dst87_r); 975cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7, 976cabdff1aSopenharmony_ci dst54_l, dst65_l, dst76_l, dst87_l); 977cabdff1aSopenharmony_ci 978cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 979cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 980cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r, 981cabdff1aSopenharmony_ci filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l); 982cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r, 983cabdff1aSopenharmony_ci filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l); 984cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l, 985cabdff1aSopenharmony_ci filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1, 986cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 987cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l, 988cabdff1aSopenharmony_ci filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1, 989cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 990cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l, 991cabdff1aSopenharmony_ci filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1, 992cabdff1aSopenharmony_ci dst4_r, dst4_l, dst5_r, dst5_l); 993cabdff1aSopenharmony_ci 994cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l, 995cabdff1aSopenharmony_ci dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, out2_r, out3_r); 996cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vsrani_h_w, dst4_l, dst4_r, 6, dst5_l, dst5_r, 6, 997cabdff1aSopenharmony_ci out4_r, out5_r); 998cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6, 999cabdff1aSopenharmony_ci out0, out1); 1000cabdff1aSopenharmony_ci out2 = __lsx_vssrarni_bu_h(out5_r, out4_r, 6); 1001cabdff1aSopenharmony_ci 1002cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst, 0, 0); 1003cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); 1004cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0); 1005cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1); 1006cabdff1aSopenharmony_ci dst += dst_stride_4x; 1007cabdff1aSopenharmony_ci __lsx_vstelm_d(out2, dst, 0, 0); 1008cabdff1aSopenharmony_ci __lsx_vstelm_d(out2, dst + dst_stride, 0, 1); 1009cabdff1aSopenharmony_ci} 1010cabdff1aSopenharmony_ci 1011cabdff1aSopenharmony_cistatic av_always_inline 1012cabdff1aSopenharmony_civoid hevc_hv_4t_8multx4mult_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 1013cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 1014cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height, 1015cabdff1aSopenharmony_ci int32_t width8mult) 1016cabdff1aSopenharmony_ci{ 1017cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 1018cabdff1aSopenharmony_ci uint8_t *src_tmp; 1019cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1020cabdff1aSopenharmony_ci const int32_t src_stride_2x = (src_stride << 1); 1021cabdff1aSopenharmony_ci const int32_t dst_stride_2x = (dst_stride << 1); 1022cabdff1aSopenharmony_ci const int32_t src_stride_4x = (src_stride << 2); 1023cabdff1aSopenharmony_ci const int32_t dst_stride_4x = (dst_stride << 2); 1024cabdff1aSopenharmony_ci const int32_t src_stride_3x = src_stride_2x + src_stride; 1025cabdff1aSopenharmony_ci const int32_t dst_stride_3x = dst_stride_2x + dst_stride; 1026cabdff1aSopenharmony_ci 1027cabdff1aSopenharmony_ci __m128i out0, out1; 1028cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6; 1029cabdff1aSopenharmony_ci __m128i filt0, filt1; 1030cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1, filter_vec; 1031cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 1032cabdff1aSopenharmony_ci __m128i mask1; 1033cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1034cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5; 1035cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 1036cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst21_r, dst43_r; 1037cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst21_l, dst43_l; 1038cabdff1aSopenharmony_ci __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6; 1039cabdff1aSopenharmony_ci __m128i out0_r, out1_r, out2_r, out3_r; 1040cabdff1aSopenharmony_ci 1041cabdff1aSopenharmony_ci src -= (src_stride + 1); 1042cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 1043cabdff1aSopenharmony_ci 1044cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 1045cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 1046cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 1047cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 1048cabdff1aSopenharmony_ci 1049cabdff1aSopenharmony_ci for (cnt = width8mult; cnt--;) { 1050cabdff1aSopenharmony_ci src_tmp = src; 1051cabdff1aSopenharmony_ci dst_tmp = dst; 1052cabdff1aSopenharmony_ci 1053cabdff1aSopenharmony_ci src0 = __lsx_vld(src_tmp, 0); 1054cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1055cabdff1aSopenharmony_ci src1, src2); 1056cabdff1aSopenharmony_ci src_tmp += src_stride_3x; 1057cabdff1aSopenharmony_ci 1058cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, 1059cabdff1aSopenharmony_ci vec0, vec1); 1060cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, 1061cabdff1aSopenharmony_ci vec2, vec3); 1062cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, 1063cabdff1aSopenharmony_ci vec4, vec5); 1064cabdff1aSopenharmony_ci 1065cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1); 1066cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec4, filt0); 1067cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, 1068cabdff1aSopenharmony_ci dst0, dst1); 1069cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1); 1070cabdff1aSopenharmony_ci 1071cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r); 1072cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l); 1073cabdff1aSopenharmony_ci 1074cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1075cabdff1aSopenharmony_ci src3 = __lsx_vld(src_tmp, 0); 1076cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1077cabdff1aSopenharmony_ci src4, src5); 1078cabdff1aSopenharmony_ci src6 = __lsx_vldx(src_tmp, src_stride_3x); 1079cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 1080cabdff1aSopenharmony_ci 1081cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4, 1082cabdff1aSopenharmony_ci src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3); 1083cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6, 1084cabdff1aSopenharmony_ci src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7); 1085cabdff1aSopenharmony_ci 1086cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, 1087cabdff1aSopenharmony_ci vec6, filt0, dst3, dst4, dst5, dst6); 1088cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, 1089cabdff1aSopenharmony_ci filt1, dst5, vec5, filt1, dst6, vec7, filt1, 1090cabdff1aSopenharmony_ci dst3, dst4, dst5, dst6); 1091cabdff1aSopenharmony_ci 1092cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4, 1093cabdff1aSopenharmony_ci dst6, dst5, dst32_r, dst43_r, dst54_r, dst65_r); 1094cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4, 1095cabdff1aSopenharmony_ci dst6, dst5, dst32_l, dst43_l, dst54_l, dst65_l); 1096cabdff1aSopenharmony_ci 1097cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 1098cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 1099cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r, 1100cabdff1aSopenharmony_ci filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l); 1101cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, 1102cabdff1aSopenharmony_ci dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, 1103cabdff1aSopenharmony_ci dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l); 1104cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, 1105cabdff1aSopenharmony_ci dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, 1106cabdff1aSopenharmony_ci dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l); 1107cabdff1aSopenharmony_ci 1108cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, 1109cabdff1aSopenharmony_ci dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, 1110cabdff1aSopenharmony_ci out2_r, out3_r); 1111cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 1112cabdff1aSopenharmony_ci 6, out0, out1); 1113cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst_tmp, 0, 0); 1114cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1); 1115cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0); 1116cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1); 1117cabdff1aSopenharmony_ci dst_tmp += dst_stride_4x; 1118cabdff1aSopenharmony_ci 1119cabdff1aSopenharmony_ci dst10_r = dst54_r; 1120cabdff1aSopenharmony_ci dst10_l = dst54_l; 1121cabdff1aSopenharmony_ci dst21_r = dst65_r; 1122cabdff1aSopenharmony_ci dst21_l = dst65_l; 1123cabdff1aSopenharmony_ci dst2 = dst6; 1124cabdff1aSopenharmony_ci } 1125cabdff1aSopenharmony_ci src += 8; 1126cabdff1aSopenharmony_ci dst += 8; 1127cabdff1aSopenharmony_ci } 1128cabdff1aSopenharmony_ci} 1129cabdff1aSopenharmony_ci 1130cabdff1aSopenharmony_cistatic 1131cabdff1aSopenharmony_civoid hevc_hv_4t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 1132cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 1133cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 1134cabdff1aSopenharmony_ci{ 1135cabdff1aSopenharmony_ci if (2 == height) { 1136cabdff1aSopenharmony_ci hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y); 1137cabdff1aSopenharmony_ci } else if (4 == height) { 1138cabdff1aSopenharmony_ci hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride, 1139cabdff1aSopenharmony_ci filter_x, filter_y, 1); 1140cabdff1aSopenharmony_ci } else if (6 == height) { 1141cabdff1aSopenharmony_ci hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y); 1142cabdff1aSopenharmony_ci } else if (0 == (height & 0x03)) { 1143cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride, 1144cabdff1aSopenharmony_ci filter_x, filter_y, height, 1); 1145cabdff1aSopenharmony_ci } 1146cabdff1aSopenharmony_ci} 1147cabdff1aSopenharmony_ci 1148cabdff1aSopenharmony_cistatic av_always_inline 1149cabdff1aSopenharmony_civoid hevc_hv_4t_12w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 1150cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 1151cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 1152cabdff1aSopenharmony_ci{ 1153cabdff1aSopenharmony_ci uint32_t loop_cnt; 1154cabdff1aSopenharmony_ci uint8_t *src_tmp, *dst_tmp; 1155cabdff1aSopenharmony_ci const int32_t src_stride_2x = (src_stride << 1); 1156cabdff1aSopenharmony_ci const int32_t dst_stride_2x = (dst_stride << 1); 1157cabdff1aSopenharmony_ci const int32_t src_stride_4x = (src_stride << 2); 1158cabdff1aSopenharmony_ci const int32_t dst_stride_4x = (dst_stride << 2); 1159cabdff1aSopenharmony_ci const int32_t src_stride_3x = src_stride_2x + src_stride; 1160cabdff1aSopenharmony_ci const int32_t dst_stride_3x = dst_stride_2x + dst_stride; 1161cabdff1aSopenharmony_ci __m128i out0, out1; 1162cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1163cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1164cabdff1aSopenharmony_ci __m128i mask0, mask1, mask2, mask3; 1165cabdff1aSopenharmony_ci __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3; 1166cabdff1aSopenharmony_ci __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6; 1167cabdff1aSopenharmony_ci __m128i dst10, dst21, dst22, dst73, dst84, dst95, dst106; 1168cabdff1aSopenharmony_ci __m128i dst76_r, dst98_r, dst87_r, dst109_r; 1169cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 1170cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 1171cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 1172cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1173cabdff1aSopenharmony_ci 1174cabdff1aSopenharmony_ci src -= (src_stride + 1); 1175cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 1176cabdff1aSopenharmony_ci 1177cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 1178cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 1179cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 1180cabdff1aSopenharmony_ci 1181cabdff1aSopenharmony_ci mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 1182cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 1183cabdff1aSopenharmony_ci 1184cabdff1aSopenharmony_ci src_tmp = src; 1185cabdff1aSopenharmony_ci dst_tmp = dst; 1186cabdff1aSopenharmony_ci 1187cabdff1aSopenharmony_ci src0 = __lsx_vld(src_tmp, 0); 1188cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1189cabdff1aSopenharmony_ci src1, src2); 1190cabdff1aSopenharmony_ci src_tmp += src_stride_3x; 1191cabdff1aSopenharmony_ci 1192cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1); 1193cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3); 1194cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5); 1195cabdff1aSopenharmony_ci 1196cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1); 1197cabdff1aSopenharmony_ci dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0); 1198cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1, 1199cabdff1aSopenharmony_ci dsth0, dsth1); 1200cabdff1aSopenharmony_ci dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1); 1201cabdff1aSopenharmony_ci 1202cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r); 1203cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l); 1204cabdff1aSopenharmony_ci 1205cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 1206cabdff1aSopenharmony_ci src3 = __lsx_vld(src_tmp, 0); 1207cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1208cabdff1aSopenharmony_ci src4, src5); 1209cabdff1aSopenharmony_ci src6 = __lsx_vldx(src_tmp, src_stride_3x); 1210cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 1211cabdff1aSopenharmony_ci 1212cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4, 1213cabdff1aSopenharmony_ci src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3); 1214cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6, 1215cabdff1aSopenharmony_ci src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7); 1216cabdff1aSopenharmony_ci 1217cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, 1218cabdff1aSopenharmony_ci vec6, filt0, dsth3, dsth4, dsth5, dsth6); 1219cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4, 1220cabdff1aSopenharmony_ci vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1, 1221cabdff1aSopenharmony_ci dsth3, dsth4, dsth5, dsth6); 1222cabdff1aSopenharmony_ci 1223cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4, 1224cabdff1aSopenharmony_ci dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r); 1225cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4, 1226cabdff1aSopenharmony_ci dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l); 1227cabdff1aSopenharmony_ci 1228cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 1229cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 1230cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r, 1231cabdff1aSopenharmony_ci filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l); 1232cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l, 1233cabdff1aSopenharmony_ci filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1, 1234cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 1235cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l, 1236cabdff1aSopenharmony_ci filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1, 1237cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 1238cabdff1aSopenharmony_ci 1239cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l, 1240cabdff1aSopenharmony_ci dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3); 1241cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1); 1242cabdff1aSopenharmony_ci 1243cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst_tmp, 0, 0); 1244cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1); 1245cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0); 1246cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1); 1247cabdff1aSopenharmony_ci dst_tmp += dst_stride_4x; 1248cabdff1aSopenharmony_ci 1249cabdff1aSopenharmony_ci dst10_r = dst54_r; 1250cabdff1aSopenharmony_ci dst10_l = dst54_l; 1251cabdff1aSopenharmony_ci dst21_r = dst65_r; 1252cabdff1aSopenharmony_ci dst21_l = dst65_l; 1253cabdff1aSopenharmony_ci dsth2 = dsth6; 1254cabdff1aSopenharmony_ci } 1255cabdff1aSopenharmony_ci 1256cabdff1aSopenharmony_ci src += 8; 1257cabdff1aSopenharmony_ci dst += 8; 1258cabdff1aSopenharmony_ci 1259cabdff1aSopenharmony_ci mask2 = __lsx_vld(ff_hevc_mask_arr, 16); 1260cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask2, 2); 1261cabdff1aSopenharmony_ci 1262cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 1263cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 1264cabdff1aSopenharmony_ci src += src_stride_3x; 1265cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1); 1266cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3); 1267cabdff1aSopenharmony_ci 1268cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21); 1269cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1, 1270cabdff1aSopenharmony_ci dst10, dst21); 1271cabdff1aSopenharmony_ci 1272cabdff1aSopenharmony_ci dst10_r = __lsx_vilvl_h(dst21, dst10); 1273cabdff1aSopenharmony_ci dst21_r = __lsx_vilvh_h(dst21, dst10); 1274cabdff1aSopenharmony_ci dst22 = __lsx_vreplvei_d(dst21, 1); 1275cabdff1aSopenharmony_ci 1276cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 1277cabdff1aSopenharmony_ci src3 = __lsx_vld(src, 0); 1278cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5); 1279cabdff1aSopenharmony_ci src6 = __lsx_vldx(src, src_stride_3x); 1280cabdff1aSopenharmony_ci src += src_stride_4x; 1281cabdff1aSopenharmony_ci src7 = __lsx_vld(src, 0); 1282cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9); 1283cabdff1aSopenharmony_ci src10 = __lsx_vldx(src, src_stride_3x); 1284cabdff1aSopenharmony_ci src += src_stride_4x; 1285cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3, src8, 1286cabdff1aSopenharmony_ci src4, mask2, src8, src4, mask3, vec0, vec1, vec2, vec3); 1287cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3, src10, 1288cabdff1aSopenharmony_ci src6, mask2, src10, src6, mask3, vec4, vec5, vec6, vec7); 1289cabdff1aSopenharmony_ci 1290cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, 1291cabdff1aSopenharmony_ci vec6, filt0, dst73, dst84, dst95, dst106); 1292cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3, 1293cabdff1aSopenharmony_ci filt1, dst95, vec5, filt1, dst106, vec7, filt1, 1294cabdff1aSopenharmony_ci dst73, dst84, dst95, dst106); 1295cabdff1aSopenharmony_ci 1296cabdff1aSopenharmony_ci dst32_r = __lsx_vilvl_h(dst73, dst22); 1297cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst84, dst73, dst95, dst84, dst43_r, dst54_r); 1298cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r); 1299cabdff1aSopenharmony_ci dst65_r = __lsx_vilvl_h(dst106, dst95); 1300cabdff1aSopenharmony_ci dst109_r = __lsx_vilvh_h(dst106, dst95); 1301cabdff1aSopenharmony_ci dst22 = __lsx_vreplvei_d(dst73, 1); 1302cabdff1aSopenharmony_ci dst76_r = __lsx_vilvl_h(dst22, dst106); 1303cabdff1aSopenharmony_ci 1304cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r, 1305cabdff1aSopenharmony_ci filt_h0, dst43_r, filt_h0, dst0, dst1, dst2, dst3); 1306cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r, 1307cabdff1aSopenharmony_ci filt_h0, dst87_r, filt_h0, dst4, dst5, dst6, dst7); 1308cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0, dst32_r, filt_h1, dst1, dst43_r, 1309cabdff1aSopenharmony_ci filt_h1, dst2, dst54_r, filt_h1, dst3, dst65_r, filt_h1, 1310cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 1311cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst4, dst76_r, filt_h1, dst5, dst87_r, 1312cabdff1aSopenharmony_ci filt_h1, dst6, dst98_r, filt_h1, dst7, dst109_r, filt_h1, 1313cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 1314cabdff1aSopenharmony_ci 1315cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vsrani_h_w, dst1, dst0, 6, dst3, dst2, 6, dst5, dst4, 1316cabdff1aSopenharmony_ci 6, dst7, dst6, 6, tmp0, tmp1, tmp2, tmp3); 1317cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1); 1318cabdff1aSopenharmony_ci 1319cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst, 0, 0); 1320cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst + dst_stride, 0, 1); 1321cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2); 1322cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3); 1323cabdff1aSopenharmony_ci dst += dst_stride_4x; 1324cabdff1aSopenharmony_ci __lsx_vstelm_w(out1, dst, 0, 0); 1325cabdff1aSopenharmony_ci __lsx_vstelm_w(out1, dst + dst_stride, 0, 1); 1326cabdff1aSopenharmony_ci __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2); 1327cabdff1aSopenharmony_ci __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3); 1328cabdff1aSopenharmony_ci dst += dst_stride_4x; 1329cabdff1aSopenharmony_ci 1330cabdff1aSopenharmony_ci dst10_r = dst98_r; 1331cabdff1aSopenharmony_ci dst21_r = dst109_r; 1332cabdff1aSopenharmony_ci dst22 = __lsx_vreplvei_d(dst106, 1); 1333cabdff1aSopenharmony_ci } 1334cabdff1aSopenharmony_ci} 1335cabdff1aSopenharmony_ci 1336cabdff1aSopenharmony_cistatic void hevc_hv_4t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 1337cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 1338cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 1339cabdff1aSopenharmony_ci{ 1340cabdff1aSopenharmony_ci if (4 == height) { 1341cabdff1aSopenharmony_ci hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride, filter_x, 1342cabdff1aSopenharmony_ci filter_y, 2); 1343cabdff1aSopenharmony_ci } else { 1344cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride, 1345cabdff1aSopenharmony_ci filter_x, filter_y, height, 2); 1346cabdff1aSopenharmony_ci } 1347cabdff1aSopenharmony_ci} 1348cabdff1aSopenharmony_ci 1349cabdff1aSopenharmony_cistatic void hevc_hv_4t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 1350cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 1351cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 1352cabdff1aSopenharmony_ci{ 1353cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride, 1354cabdff1aSopenharmony_ci filter_x, filter_y, height, 3); 1355cabdff1aSopenharmony_ci} 1356cabdff1aSopenharmony_ci 1357cabdff1aSopenharmony_cistatic void hevc_hv_4t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, 1358cabdff1aSopenharmony_ci int32_t dst_stride, const int8_t *filter_x, 1359cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t height) 1360cabdff1aSopenharmony_ci{ 1361cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride, 1362cabdff1aSopenharmony_ci filter_x, filter_y, height, 4); 1363cabdff1aSopenharmony_ci} 1364cabdff1aSopenharmony_ci 1365cabdff1aSopenharmony_ci#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 1366cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \ 1367cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 1368cabdff1aSopenharmony_ci uint8_t *src, \ 1369cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 1370cabdff1aSopenharmony_ci int height, \ 1371cabdff1aSopenharmony_ci intptr_t mx, \ 1372cabdff1aSopenharmony_ci intptr_t my, \ 1373cabdff1aSopenharmony_ci int width) \ 1374cabdff1aSopenharmony_ci{ \ 1375cabdff1aSopenharmony_ci const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 1376cabdff1aSopenharmony_ci \ 1377cabdff1aSopenharmony_ci common_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \ 1378cabdff1aSopenharmony_ci filter, height); \ 1379cabdff1aSopenharmony_ci} 1380cabdff1aSopenharmony_ci 1381cabdff1aSopenharmony_ciUNI_MC(qpel, h, 64, 8, hz, mx); 1382cabdff1aSopenharmony_ci 1383cabdff1aSopenharmony_ciUNI_MC(qpel, v, 24, 8, vt, my); 1384cabdff1aSopenharmony_ciUNI_MC(qpel, v, 32, 8, vt, my); 1385cabdff1aSopenharmony_ciUNI_MC(qpel, v, 48, 8, vt, my); 1386cabdff1aSopenharmony_ciUNI_MC(qpel, v, 64, 8, vt, my); 1387cabdff1aSopenharmony_ci 1388cabdff1aSopenharmony_ciUNI_MC(epel, v, 24, 4, vt, my); 1389cabdff1aSopenharmony_ciUNI_MC(epel, v, 32, 4, vt, my); 1390cabdff1aSopenharmony_ci 1391cabdff1aSopenharmony_ci#undef UNI_MC 1392cabdff1aSopenharmony_ci 1393cabdff1aSopenharmony_ci#define UNI_MC_HV(PEL, WIDTH, TAP) \ 1394cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \ 1395cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 1396cabdff1aSopenharmony_ci uint8_t *src, \ 1397cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 1398cabdff1aSopenharmony_ci int height, \ 1399cabdff1aSopenharmony_ci intptr_t mx, \ 1400cabdff1aSopenharmony_ci intptr_t my, \ 1401cabdff1aSopenharmony_ci int width) \ 1402cabdff1aSopenharmony_ci{ \ 1403cabdff1aSopenharmony_ci const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 1404cabdff1aSopenharmony_ci const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 1405cabdff1aSopenharmony_ci \ 1406cabdff1aSopenharmony_ci hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \ 1407cabdff1aSopenharmony_ci filter_x, filter_y, height); \ 1408cabdff1aSopenharmony_ci} 1409cabdff1aSopenharmony_ci 1410cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 8, 8); 1411cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 16, 8); 1412cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 24, 8); 1413cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 32, 8); 1414cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 48, 8); 1415cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 64, 8); 1416cabdff1aSopenharmony_ci 1417cabdff1aSopenharmony_ciUNI_MC_HV(epel, 8, 4); 1418cabdff1aSopenharmony_ciUNI_MC_HV(epel, 12, 4); 1419cabdff1aSopenharmony_ciUNI_MC_HV(epel, 16, 4); 1420cabdff1aSopenharmony_ciUNI_MC_HV(epel, 24, 4); 1421cabdff1aSopenharmony_ciUNI_MC_HV(epel, 32, 4); 1422cabdff1aSopenharmony_ci 1423cabdff1aSopenharmony_ci#undef UNI_MC_HV 1424