1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2022 Loongson Technology Corporation Limited 3cabdff1aSopenharmony_ci * Contributed by Lu Wang <wanglu@loongson.cn> 4cabdff1aSopenharmony_ci * Hao Chen <chenhao@loongson.cn> 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * This file is part of FFmpeg. 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 12cabdff1aSopenharmony_ci * 13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16cabdff1aSopenharmony_ci * Lesser General Public License for more details. 
17cabdff1aSopenharmony_ci * 18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21cabdff1aSopenharmony_ci */ 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 24cabdff1aSopenharmony_ci#include "hevcdsp_lsx.h" 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_cistatic const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { 27cabdff1aSopenharmony_ci /* 8 width cases */ 28cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 29cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 30cabdff1aSopenharmony_ci}; 31cabdff1aSopenharmony_ci 32cabdff1aSopenharmony_cistatic av_always_inline __m128i 33cabdff1aSopenharmony_cihevc_bi_rnd_clip(__m128i in0, __m128i vec0, __m128i in1, __m128i vec1) 34cabdff1aSopenharmony_ci{ 35cabdff1aSopenharmony_ci __m128i out; 36cabdff1aSopenharmony_ci 37cabdff1aSopenharmony_ci vec0 = __lsx_vsadd_h(in0, vec0); 38cabdff1aSopenharmony_ci vec1 = __lsx_vsadd_h(in1, vec1); 39cabdff1aSopenharmony_ci out = __lsx_vssrarni_bu_h(vec1, vec0, 7); 40cabdff1aSopenharmony_ci return out; 41cabdff1aSopenharmony_ci} 42cabdff1aSopenharmony_ci 43cabdff1aSopenharmony_ci/* hevc_bi_copy: dst = av_clip_uint8((src0 << 6 + src1) >> 7) */ 44cabdff1aSopenharmony_cistatic 45cabdff1aSopenharmony_civoid hevc_bi_copy_4w_lsx(uint8_t *src0_ptr, int32_t src_stride, 46cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 47cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, int32_t height) 48cabdff1aSopenharmony_ci{ 49cabdff1aSopenharmony_ci int32_t loop_cnt = height >> 3; 50cabdff1aSopenharmony_ci int32_t res = (height & 0x07) >> 1; 51cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 
52cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 53cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 54cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 55cabdff1aSopenharmony_ci int32_t src2_stride_2x = (src2_stride << 1); 56cabdff1aSopenharmony_ci int32_t src2_stride_4x = (src2_stride << 2); 57cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 58cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 59cabdff1aSopenharmony_ci int32_t src2_stride_3x = src2_stride_2x + src2_stride; 60cabdff1aSopenharmony_ci __m128i src0, src1; 61cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 62cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3; 63cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3; 64cabdff1aSopenharmony_ci __m128i reg0, reg1, reg2, reg3; 65cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3; 66cabdff1aSopenharmony_ci 67cabdff1aSopenharmony_ci for (;loop_cnt--;) { 68cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_w(src0_ptr, 0); 69cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0); 70cabdff1aSopenharmony_ci reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0); 71cabdff1aSopenharmony_ci reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0); 72cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 73cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1); 74cabdff1aSopenharmony_ci src0 = __lsx_vilvl_d(tmp1, tmp0); 75cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_w(src0_ptr, 0); 76cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0); 77cabdff1aSopenharmony_ci reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0); 78cabdff1aSopenharmony_ci reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0); 79cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1); 80cabdff1aSopenharmony_ci src1 = __lsx_vilvl_d(tmp1, tmp0); 81cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 
82cabdff1aSopenharmony_ci 83cabdff1aSopenharmony_ci tmp0 = __lsx_vldrepl_d(src1_ptr, 0); 84cabdff1aSopenharmony_ci tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0); 85cabdff1aSopenharmony_ci tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0); 86cabdff1aSopenharmony_ci tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0); 87cabdff1aSopenharmony_ci src1_ptr += src2_stride_4x; 88cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in0, in1); 89cabdff1aSopenharmony_ci tmp0 = __lsx_vldrepl_d(src1_ptr, 0); 90cabdff1aSopenharmony_ci tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0); 91cabdff1aSopenharmony_ci tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0); 92cabdff1aSopenharmony_ci tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0); 93cabdff1aSopenharmony_ci src1_ptr += src2_stride_4x; 94cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in2, in3); 95cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst0, dst2); 96cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, dst1, dst3); 97cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst1, dst3); 98cabdff1aSopenharmony_ci dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 99cabdff1aSopenharmony_ci dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3); 100cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); 101cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1); 102cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst + dst_stride_2x, 0, 2); 103cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst + dst_stride_3x, 0, 3); 104cabdff1aSopenharmony_ci dst += dst_stride_4x; 105cabdff1aSopenharmony_ci __lsx_vstelm_w(dst1, dst, 0, 0); 106cabdff1aSopenharmony_ci __lsx_vstelm_w(dst1, dst + dst_stride, 0, 1); 107cabdff1aSopenharmony_ci __lsx_vstelm_w(dst1, dst + dst_stride_2x, 0, 2); 108cabdff1aSopenharmony_ci __lsx_vstelm_w(dst1, dst + dst_stride_3x, 0, 3); 109cabdff1aSopenharmony_ci dst += dst_stride_4x; 
110cabdff1aSopenharmony_ci } 111cabdff1aSopenharmony_ci for(;res--;) { 112cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_w(src0_ptr, 0); 113cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0); 114cabdff1aSopenharmony_ci reg2 = __lsx_vldrepl_d(src1_ptr, 0); 115cabdff1aSopenharmony_ci reg3 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0); 116cabdff1aSopenharmony_ci src0 = __lsx_vilvl_w(reg1, reg0); 117cabdff1aSopenharmony_ci in0 = __lsx_vilvl_d(reg3, reg2); 118cabdff1aSopenharmony_ci dst0 = __lsx_vsllwil_hu_bu(src0, 6); 119cabdff1aSopenharmony_ci dst0 = __lsx_vsadd_h(dst0, in0); 120cabdff1aSopenharmony_ci dst0 = __lsx_vssrarni_bu_h(dst0, dst0, 7); 121cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); 122cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1); 123cabdff1aSopenharmony_ci src0_ptr += src_stride_2x; 124cabdff1aSopenharmony_ci src1_ptr += src2_stride_2x; 125cabdff1aSopenharmony_ci dst += dst_stride_2x; 126cabdff1aSopenharmony_ci } 127cabdff1aSopenharmony_ci} 128cabdff1aSopenharmony_ci 129cabdff1aSopenharmony_cistatic 130cabdff1aSopenharmony_civoid hevc_bi_copy_6w_lsx(uint8_t *src0_ptr, int32_t src_stride, 131cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 132cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, int32_t height) 133cabdff1aSopenharmony_ci{ 134cabdff1aSopenharmony_ci int32_t loop_cnt; 135cabdff1aSopenharmony_ci int32_t res = (height & 0x07) >> 1; 136cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 137cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 138cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 139cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 140cabdff1aSopenharmony_ci int32_t src2_stride_x = (src2_stride << 1); 141cabdff1aSopenharmony_ci int32_t src2_stride_2x = (src2_stride << 2); 142cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 143cabdff1aSopenharmony_ci int32_t dst_stride_3x = 
dst_stride_2x + dst_stride; 144cabdff1aSopenharmony_ci int32_t src2_stride_3x = src2_stride_2x + src2_stride_x; 145cabdff1aSopenharmony_ci __m128i out0, out1, out2, out3; 146cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 147cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3; 148cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5, in6, in7; 149cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 150cabdff1aSopenharmony_ci __m128i reg0, reg1, reg2, reg3; 151cabdff1aSopenharmony_ci 152cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 153cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_d(src0_ptr, 0); 154cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0); 155cabdff1aSopenharmony_ci reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0); 156cabdff1aSopenharmony_ci reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0); 157cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1); 158cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 159cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_d(src0_ptr, 0); 160cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0); 161cabdff1aSopenharmony_ci reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0); 162cabdff1aSopenharmony_ci reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0); 163cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3); 164cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 165cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr, 0); 166cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, 167cabdff1aSopenharmony_ci src2_stride_2x, in1, in2); 168cabdff1aSopenharmony_ci in3 = __lsx_vldx(src1_ptr, src2_stride_3x); 169cabdff1aSopenharmony_ci src1_ptr += src2_stride_2x; 170cabdff1aSopenharmony_ci in4 = __lsx_vld(src1_ptr, 0); 171cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, 172cabdff1aSopenharmony_ci src2_stride_2x, 
in5, in6); 173cabdff1aSopenharmony_ci in7 = __lsx_vldx(src1_ptr, src2_stride_3x); 174cabdff1aSopenharmony_ci src1_ptr += src2_stride_2x; 175cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 176cabdff1aSopenharmony_ci dst0, dst2, dst4, dst6); 177cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3, 178cabdff1aSopenharmony_ci dst1, dst3, dst5, dst7); 179cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1, dst3, 180cabdff1aSopenharmony_ci dst5, dst7); 181cabdff1aSopenharmony_ci out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 182cabdff1aSopenharmony_ci out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3); 183cabdff1aSopenharmony_ci out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5); 184cabdff1aSopenharmony_ci out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7); 185cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst, 0, 0); 186cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst + dst_stride, 0, 2); 187cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst, 4, 2); 188cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst + dst_stride, 4, 6); 189cabdff1aSopenharmony_ci __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 0); 190cabdff1aSopenharmony_ci __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 2); 191cabdff1aSopenharmony_ci __lsx_vstelm_h(out1, dst + dst_stride_2x, 4, 2); 192cabdff1aSopenharmony_ci __lsx_vstelm_h(out1, dst + dst_stride_3x, 4, 6); 193cabdff1aSopenharmony_ci dst += dst_stride_4x; 194cabdff1aSopenharmony_ci __lsx_vstelm_w(out2, dst, 0, 0); 195cabdff1aSopenharmony_ci __lsx_vstelm_w(out2, dst + dst_stride, 0, 2); 196cabdff1aSopenharmony_ci __lsx_vstelm_h(out2, dst, 4, 2); 197cabdff1aSopenharmony_ci __lsx_vstelm_h(out2, dst + dst_stride, 4, 6); 198cabdff1aSopenharmony_ci __lsx_vstelm_w(out3, dst + dst_stride_2x, 0, 0); 199cabdff1aSopenharmony_ci __lsx_vstelm_w(out3, dst + dst_stride_3x, 0, 2); 200cabdff1aSopenharmony_ci __lsx_vstelm_h(out3, dst + dst_stride_2x, 4, 2); 
201cabdff1aSopenharmony_ci __lsx_vstelm_h(out3, dst + dst_stride_3x, 4, 6); 202cabdff1aSopenharmony_ci dst += dst_stride_4x; 203cabdff1aSopenharmony_ci } 204cabdff1aSopenharmony_ci for (;res--;) { 205cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_d(src0_ptr, 0); 206cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0); 207cabdff1aSopenharmony_ci src0 = __lsx_vilvl_d(reg1, reg0); 208cabdff1aSopenharmony_ci src0_ptr += src_stride_2x; 209cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr, 0); 210cabdff1aSopenharmony_ci in1 = __lsx_vldx(src1_ptr, src2_stride_x); 211cabdff1aSopenharmony_ci src1_ptr += src2_stride_x; 212cabdff1aSopenharmony_ci dst0 = __lsx_vsllwil_hu_bu(src0, 6); 213cabdff1aSopenharmony_ci dst1 = __lsx_vilvh_b(zero, src0); 214cabdff1aSopenharmony_ci dst1 = __lsx_vslli_h(dst1, 6); 215cabdff1aSopenharmony_ci out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 216cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst, 0, 0); 217cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst, 4, 2); 218cabdff1aSopenharmony_ci dst += dst_stride; 219cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst, 0, 2); 220cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst, 4, 6); 221cabdff1aSopenharmony_ci dst += dst_stride; 222cabdff1aSopenharmony_ci } 223cabdff1aSopenharmony_ci} 224cabdff1aSopenharmony_ci 225cabdff1aSopenharmony_cistatic 226cabdff1aSopenharmony_civoid hevc_bi_copy_8w_lsx(uint8_t *src0_ptr, int32_t src_stride, 227cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 228cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, int32_t height) 229cabdff1aSopenharmony_ci{ 230cabdff1aSopenharmony_ci int32_t loop_cnt = height >> 3; 231cabdff1aSopenharmony_ci int32_t res = (height & 7) >> 1; 232cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 233cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 234cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 235cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 
2); 236cabdff1aSopenharmony_ci int32_t src2_stride_x = (src2_stride << 1); 237cabdff1aSopenharmony_ci int32_t src2_stride_2x = (src2_stride << 2); 238cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 239cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 240cabdff1aSopenharmony_ci int32_t src2_stride_3x = src2_stride_2x + src2_stride_x; 241cabdff1aSopenharmony_ci __m128i out0, out1, out2, out3; 242cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3; 243cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 244cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5, in6, in7; 245cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 246cabdff1aSopenharmony_ci __m128i reg0, reg1, reg2, reg3; 247cabdff1aSopenharmony_ci 248cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 249cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_d(src0_ptr, 0); 250cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0); 251cabdff1aSopenharmony_ci reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0); 252cabdff1aSopenharmony_ci reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0); 253cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1); 254cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 255cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_d(src0_ptr, 0); 256cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0); 257cabdff1aSopenharmony_ci reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0); 258cabdff1aSopenharmony_ci reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0); 259cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3); 260cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 261cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 262cabdff1aSopenharmony_ci dst0, dst2, dst4, dst6); 263cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, 
src2, zero, 264cabdff1aSopenharmony_ci src3, dst1, dst3, dst5, dst7); 265cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1, 266cabdff1aSopenharmony_ci dst3, dst5, dst7); 267cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr, 0); 268cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, 269cabdff1aSopenharmony_ci src2_stride_2x, in1, in2); 270cabdff1aSopenharmony_ci in3 = __lsx_vldx(src1_ptr, src2_stride_3x); 271cabdff1aSopenharmony_ci src1_ptr += src2_stride_2x; 272cabdff1aSopenharmony_ci in4 = __lsx_vld(src1_ptr, 0); 273cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, 274cabdff1aSopenharmony_ci src2_stride_2x, in5, in6); 275cabdff1aSopenharmony_ci in7 = __lsx_vldx(src1_ptr, src2_stride_3x); 276cabdff1aSopenharmony_ci src1_ptr += src2_stride_2x; 277cabdff1aSopenharmony_ci out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 278cabdff1aSopenharmony_ci out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3); 279cabdff1aSopenharmony_ci out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5); 280cabdff1aSopenharmony_ci out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7); 281cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst, 0, 0); 282cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); 283cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0); 284cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1); 285cabdff1aSopenharmony_ci dst += dst_stride_4x; 286cabdff1aSopenharmony_ci __lsx_vstelm_d(out2, dst, 0, 0); 287cabdff1aSopenharmony_ci __lsx_vstelm_d(out2, dst + dst_stride, 0, 1); 288cabdff1aSopenharmony_ci __lsx_vstelm_d(out3, dst + dst_stride_2x, 0, 0); 289cabdff1aSopenharmony_ci __lsx_vstelm_d(out3, dst + dst_stride_3x, 0, 1); 290cabdff1aSopenharmony_ci dst += dst_stride_4x; 291cabdff1aSopenharmony_ci } 292cabdff1aSopenharmony_ci for (;res--;) { 293cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_d(src0_ptr, 0); 294cabdff1aSopenharmony_ci reg1 = 
__lsx_vldrepl_d(src0_ptr + src_stride, 0); 295cabdff1aSopenharmony_ci src0 = __lsx_vilvl_d(reg1, reg0); 296cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr, 0); 297cabdff1aSopenharmony_ci in1 = __lsx_vldx(src1_ptr, src2_stride_x); 298cabdff1aSopenharmony_ci dst0 = __lsx_vsllwil_hu_bu(src0, 6); 299cabdff1aSopenharmony_ci dst1 = __lsx_vilvh_b(zero, src0); 300cabdff1aSopenharmony_ci dst1 = __lsx_vslli_h(dst1, 6); 301cabdff1aSopenharmony_ci out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 302cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst, 0, 0); 303cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); 304cabdff1aSopenharmony_ci src0_ptr += src_stride_2x; 305cabdff1aSopenharmony_ci src1_ptr += src2_stride_x; 306cabdff1aSopenharmony_ci dst += dst_stride_2x; 307cabdff1aSopenharmony_ci } 308cabdff1aSopenharmony_ci} 309cabdff1aSopenharmony_ci 310cabdff1aSopenharmony_cistatic 311cabdff1aSopenharmony_civoid hevc_bi_copy_12w_lsx(uint8_t *src0_ptr, int32_t src_stride, 312cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 313cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, int32_t height) 314cabdff1aSopenharmony_ci{ 315cabdff1aSopenharmony_ci uint32_t loop_cnt; 316cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 317cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 318cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 319cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 320cabdff1aSopenharmony_ci int32_t src2_stride_x = (src2_stride << 1); 321cabdff1aSopenharmony_ci int32_t src2_stride_2x = (src2_stride << 2); 322cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 323cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 324cabdff1aSopenharmony_ci int32_t src2_stride_3x = src2_stride_2x + src2_stride_x; 325cabdff1aSopenharmony_ci int16_t* _src1 = src1_ptr + 8; 326cabdff1aSopenharmony_ci __m128i out0, out1, out2; 
327cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3; 328cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5, in6, in7; 329cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5; 330cabdff1aSopenharmony_ci 331cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 332cabdff1aSopenharmony_ci src0 = __lsx_vld(src0_ptr, 0); 333cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 334cabdff1aSopenharmony_ci src1, src2); 335cabdff1aSopenharmony_ci src3 = __lsx_vldx(src0_ptr, src_stride_3x); 336cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 337cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr, 0); 338cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, 339cabdff1aSopenharmony_ci src2_stride_2x, in1, in2); 340cabdff1aSopenharmony_ci in3 = __lsx_vldx(src1_ptr, src2_stride_3x); 341cabdff1aSopenharmony_ci src1_ptr += src2_stride_2x; 342cabdff1aSopenharmony_ci in4 = __lsx_vld(_src1, 0); 343cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x, 344cabdff1aSopenharmony_ci in5, in6); 345cabdff1aSopenharmony_ci in7 = __lsx_vldx(_src1, src2_stride_3x); 346cabdff1aSopenharmony_ci _src1 += src2_stride_2x; 347cabdff1aSopenharmony_ci 348cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5); 349cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 350cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3) 351cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1); 352cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst4, dst5) 353cabdff1aSopenharmony_ci out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 354cabdff1aSopenharmony_ci out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3); 355cabdff1aSopenharmony_ci out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5); 356cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst, 0, 0); 357cabdff1aSopenharmony_ci 
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1); 358cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0); 359cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1); 360cabdff1aSopenharmony_ci __lsx_vstelm_w(out2, dst, 8, 0); 361cabdff1aSopenharmony_ci __lsx_vstelm_w(out2, dst + dst_stride, 8, 1); 362cabdff1aSopenharmony_ci __lsx_vstelm_w(out2, dst + dst_stride_2x, 8, 2); 363cabdff1aSopenharmony_ci __lsx_vstelm_w(out2, dst + dst_stride_3x, 8, 3); 364cabdff1aSopenharmony_ci dst += dst_stride_4x; 365cabdff1aSopenharmony_ci } 366cabdff1aSopenharmony_ci} 367cabdff1aSopenharmony_ci 368cabdff1aSopenharmony_cistatic 369cabdff1aSopenharmony_civoid hevc_bi_copy_16w_lsx(uint8_t *src0_ptr, int32_t src_stride, 370cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 371cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, int32_t height) 372cabdff1aSopenharmony_ci{ 373cabdff1aSopenharmony_ci uint32_t loop_cnt; 374cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 375cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 376cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 377cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 378cabdff1aSopenharmony_ci int32_t src2_stride_x = (src2_stride << 1); 379cabdff1aSopenharmony_ci int32_t src2_stride_2x = (src2_stride << 2); 380cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 381cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 382cabdff1aSopenharmony_ci int32_t src2_stride_3x = src2_stride_2x + src2_stride_x; 383cabdff1aSopenharmony_ci int16_t *_src1 = src1_ptr + 8; 384cabdff1aSopenharmony_ci __m128i out0, out1, out2, out3; 385cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3; 386cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5, in6, in7; 387cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; 388cabdff1aSopenharmony_ci 
__m128i zero = {0}; 389cabdff1aSopenharmony_ci 390cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 391cabdff1aSopenharmony_ci src0 = __lsx_vld(src0_ptr, 0); 392cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 393cabdff1aSopenharmony_ci src1, src2); 394cabdff1aSopenharmony_ci src3 = __lsx_vldx(src0_ptr, src_stride_3x); 395cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 396cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr, 0); 397cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, 398cabdff1aSopenharmony_ci src2_stride_2x, in1, in2); 399cabdff1aSopenharmony_ci in3 = __lsx_vldx(src1_ptr, src2_stride_3x); 400cabdff1aSopenharmony_ci src1_ptr += src2_stride_2x; 401cabdff1aSopenharmony_ci in4 = __lsx_vld(_src1, 0); 402cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x, 403cabdff1aSopenharmony_ci in5, in6); 404cabdff1aSopenharmony_ci in7 = __lsx_vldx(_src1, src2_stride_3x); 405cabdff1aSopenharmony_ci _src1 += src2_stride_2x; 406cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 407cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r) 408cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3, 409cabdff1aSopenharmony_ci dst0_l, dst1_l, dst2_l, dst3_l); 410cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6, 411cabdff1aSopenharmony_ci dst0_l, dst1_l, dst2_l, dst3_l); 412cabdff1aSopenharmony_ci 413cabdff1aSopenharmony_ci out0 = hevc_bi_rnd_clip(in0, dst0_r, in4, dst0_l); 414cabdff1aSopenharmony_ci out1 = hevc_bi_rnd_clip(in1, dst1_r, in5, dst1_l); 415cabdff1aSopenharmony_ci out2 = hevc_bi_rnd_clip(in2, dst2_r, in6, dst2_l); 416cabdff1aSopenharmony_ci out3 = hevc_bi_rnd_clip(in3, dst3_r, in7, dst3_l); 417cabdff1aSopenharmony_ci __lsx_vst(out0, dst, 0); 418cabdff1aSopenharmony_ci __lsx_vstx(out1, dst, dst_stride); 
419cabdff1aSopenharmony_ci __lsx_vstx(out2, dst, dst_stride_2x); 420cabdff1aSopenharmony_ci __lsx_vstx(out3, dst, dst_stride_3x); 421cabdff1aSopenharmony_ci dst += dst_stride_4x; 422cabdff1aSopenharmony_ci } 423cabdff1aSopenharmony_ci} 424cabdff1aSopenharmony_ci 425cabdff1aSopenharmony_cistatic 426cabdff1aSopenharmony_civoid hevc_bi_copy_24w_lsx(uint8_t *src0_ptr, int32_t src_stride, 427cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 428cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, int32_t height) 429cabdff1aSopenharmony_ci{ 430cabdff1aSopenharmony_ci hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 431cabdff1aSopenharmony_ci dst, dst_stride, height); 432cabdff1aSopenharmony_ci hevc_bi_copy_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, 433cabdff1aSopenharmony_ci dst + 16, dst_stride, height); 434cabdff1aSopenharmony_ci} 435cabdff1aSopenharmony_ci 436cabdff1aSopenharmony_cistatic 437cabdff1aSopenharmony_civoid hevc_bi_copy_32w_lsx(uint8_t *src0_ptr, int32_t src_stride, 438cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 439cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, int32_t height) 440cabdff1aSopenharmony_ci{ 441cabdff1aSopenharmony_ci hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 442cabdff1aSopenharmony_ci dst, dst_stride, height); 443cabdff1aSopenharmony_ci hevc_bi_copy_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, 444cabdff1aSopenharmony_ci dst + 16, dst_stride, height); 445cabdff1aSopenharmony_ci} 446cabdff1aSopenharmony_ci 447cabdff1aSopenharmony_cistatic 448cabdff1aSopenharmony_civoid hevc_bi_copy_48w_lsx(uint8_t *src0_ptr, int32_t src_stride, 449cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 450cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, int32_t height) 451cabdff1aSopenharmony_ci{ 452cabdff1aSopenharmony_ci hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 453cabdff1aSopenharmony_ci dst, 
dst_stride, height); 454cabdff1aSopenharmony_ci hevc_bi_copy_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, 455cabdff1aSopenharmony_ci dst + 16, dst_stride, height); 456cabdff1aSopenharmony_ci} 457cabdff1aSopenharmony_ci 458cabdff1aSopenharmony_cistatic 459cabdff1aSopenharmony_civoid hevc_bi_copy_64w_lsx(uint8_t *src0_ptr, int32_t src_stride, 460cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 461cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, int32_t height) 462cabdff1aSopenharmony_ci{ 463cabdff1aSopenharmony_ci hevc_bi_copy_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 464cabdff1aSopenharmony_ci dst, dst_stride, height); 465cabdff1aSopenharmony_ci hevc_bi_copy_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride, 466cabdff1aSopenharmony_ci dst + 32, dst_stride, height); 467cabdff1aSopenharmony_ci} 468cabdff1aSopenharmony_ci 469cabdff1aSopenharmony_cistatic void hevc_hz_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride, 470cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 471cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 472cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 473cabdff1aSopenharmony_ci{ 474cabdff1aSopenharmony_ci uint32_t loop_cnt; 475cabdff1aSopenharmony_ci const int32_t dst_stride_2x = (dst_stride << 1); 476cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3; 477cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 478cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 479cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3; 480cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3; 481cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3; 482cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 483cabdff1aSopenharmony_ci 484cabdff1aSopenharmony_ci src0_ptr -= 3; 485cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 486cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 
487cabdff1aSopenharmony_ci 488cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 489cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 490cabdff1aSopenharmony_ci 491cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 492cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src0, src1); 493cabdff1aSopenharmony_ci src0_ptr += src_stride; 494cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src2, src3); 495cabdff1aSopenharmony_ci src0_ptr += src_stride; 496cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1); 497cabdff1aSopenharmony_ci src1_ptr += src2_stride; 498cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3); 499cabdff1aSopenharmony_ci src1_ptr += src2_stride; 500cabdff1aSopenharmony_ci 501cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, 502cabdff1aSopenharmony_ci vec0, vec1); 503cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, 504cabdff1aSopenharmony_ci vec2, vec3); 505cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 506cabdff1aSopenharmony_ci vec3, filt0, dst0, dst1, dst2, dst3); 507cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, 508cabdff1aSopenharmony_ci vec0, vec1); 509cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, 510cabdff1aSopenharmony_ci vec2, vec3); 511cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1, 512cabdff1aSopenharmony_ci dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3); 513cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, 514cabdff1aSopenharmony_ci vec0, vec1); 515cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, 516cabdff1aSopenharmony_ci vec2, vec3); 
517cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2, 518cabdff1aSopenharmony_ci dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3); 519cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, 520cabdff1aSopenharmony_ci vec0, vec1); 521cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3, 522cabdff1aSopenharmony_ci vec2, vec3); 523cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3, 524cabdff1aSopenharmony_ci dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3); 525cabdff1aSopenharmony_ci 526cabdff1aSopenharmony_ci dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 527cabdff1aSopenharmony_ci dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3); 528cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 529cabdff1aSopenharmony_ci __lsx_vstx(dst1, dst, dst_stride); 530cabdff1aSopenharmony_ci dst += dst_stride_2x; 531cabdff1aSopenharmony_ci } 532cabdff1aSopenharmony_ci} 533cabdff1aSopenharmony_ci 534cabdff1aSopenharmony_cistatic void hevc_hz_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride, 535cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 536cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 537cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 538cabdff1aSopenharmony_ci{ 539cabdff1aSopenharmony_ci uint32_t loop_cnt; 540cabdff1aSopenharmony_ci __m128i src0, src1, tmp0, tmp1; 541cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 542cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7; 543cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3; 544cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2; 545cabdff1aSopenharmony_ci __m128i in0, in1, in2; 546cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 547cabdff1aSopenharmony_ci 548cabdff1aSopenharmony_ci src0_ptr -= 3; 549cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, 
filter, 2, filter, 4, filter, 6, 550cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 551cabdff1aSopenharmony_ci 552cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1, 553cabdff1aSopenharmony_ci mask2, mask3, mask4); 554cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6); 555cabdff1aSopenharmony_ci mask7 = __lsx_vaddi_bu(mask0, 14); 556cabdff1aSopenharmony_ci 557cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 558cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1); 559cabdff1aSopenharmony_ci src0_ptr += src_stride; 560cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1); 561cabdff1aSopenharmony_ci in2 = __lsx_vld(src1_ptr, 32); 562cabdff1aSopenharmony_ci src1_ptr += src2_stride; 563cabdff1aSopenharmony_ci 564cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1, 565cabdff1aSopenharmony_ci src1, mask0, src0, src0, mask1, vec0, vec1, vec2, vec3); 566cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, dst0, dst1); 567cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec2, filt0); 568cabdff1aSopenharmony_ci dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt1); 569cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask5, src1, src1, mask1, src0, 570cabdff1aSopenharmony_ci src0, mask2, src1, src0, mask6, vec0, vec1, vec2, vec3); 571cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec0, filt1, dst2, vec1, filt1, 572cabdff1aSopenharmony_ci dst0, vec2, filt2, dst1, vec3, filt2, dst1, dst2, dst0, dst1); 573cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask2, src0, src0, mask3, src1, src0, 574cabdff1aSopenharmony_ci mask7, src1, src1, mask3, vec0, vec1, vec2, vec3); 575cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec0, filt2, dst0, vec1, filt3, 576cabdff1aSopenharmony_ci dst1, vec2, filt3, dst2, vec3, filt3, 
dst2, dst0, dst1, dst2); 577cabdff1aSopenharmony_ci 578cabdff1aSopenharmony_ci tmp0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 579cabdff1aSopenharmony_ci dst2 = __lsx_vsadd_h(dst2, in2); 580cabdff1aSopenharmony_ci tmp1 = __lsx_vssrarni_bu_h(dst2, dst2, 7); 581cabdff1aSopenharmony_ci 582cabdff1aSopenharmony_ci __lsx_vst(tmp0, dst, 0); 583cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp1, dst, 16, 0); 584cabdff1aSopenharmony_ci dst += dst_stride; 585cabdff1aSopenharmony_ci } 586cabdff1aSopenharmony_ci} 587cabdff1aSopenharmony_ci 588cabdff1aSopenharmony_cistatic void hevc_hz_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride, 589cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 590cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 591cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 592cabdff1aSopenharmony_ci{ 593cabdff1aSopenharmony_ci hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 594cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 595cabdff1aSopenharmony_ci hevc_hz_8t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, 596cabdff1aSopenharmony_ci dst + 16, dst_stride, filter, height); 597cabdff1aSopenharmony_ci} 598cabdff1aSopenharmony_ci 599cabdff1aSopenharmony_cistatic void hevc_hz_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride, 600cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 601cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 602cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 603cabdff1aSopenharmony_ci{ 604cabdff1aSopenharmony_ci hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 605cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 606cabdff1aSopenharmony_ci hevc_hz_8t_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, 607cabdff1aSopenharmony_ci dst + 16, dst_stride, filter, height); 608cabdff1aSopenharmony_ci} 609cabdff1aSopenharmony_ci 610cabdff1aSopenharmony_cistatic void hevc_hz_8t_64w_lsx(uint8_t *src0_ptr, int32_t 
src_stride, 611cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 612cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 613cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 614cabdff1aSopenharmony_ci{ 615cabdff1aSopenharmony_ci hevc_hz_8t_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 616cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 617cabdff1aSopenharmony_ci hevc_hz_8t_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride, 618cabdff1aSopenharmony_ci dst + 32, dst_stride, filter, height); 619cabdff1aSopenharmony_ci} 620cabdff1aSopenharmony_ci 621cabdff1aSopenharmony_cistatic av_always_inline 622cabdff1aSopenharmony_civoid hevc_vt_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, 623cabdff1aSopenharmony_ci int32_t src2_stride, uint8_t *dst, int32_t dst_stride,\ 624cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 625cabdff1aSopenharmony_ci{ 626cabdff1aSopenharmony_ci int32_t loop_cnt; 627cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 628cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 629cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 630cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 631cabdff1aSopenharmony_ci int32_t src2_stride_x = (src2_stride << 1); 632cabdff1aSopenharmony_ci int32_t src2_stride_2x = (src2_stride << 2); 633cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 634cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 635cabdff1aSopenharmony_ci int32_t src2_stride_3x = src2_stride_2x + src2_stride_x; 636cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5; 637cabdff1aSopenharmony_ci __m128i src6, src7, src8, src9, src10; 638cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3; 639cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src54_r, src76_r, src98_r; 640cabdff1aSopenharmony_ci __m128i src21_r, src43_r, src65_r, src87_r, 
src109_r; 641cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst2_r, dst3_r; 642cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 643cabdff1aSopenharmony_ci 644cabdff1aSopenharmony_ci src0_ptr -= src_stride_3x; 645cabdff1aSopenharmony_ci 646cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 647cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 648cabdff1aSopenharmony_ci 649cabdff1aSopenharmony_ci src0 = __lsx_vld(src0_ptr, 0); 650cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 651cabdff1aSopenharmony_ci src1, src2); 652cabdff1aSopenharmony_ci src3 = __lsx_vldx(src0_ptr, src_stride_3x); 653cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 654cabdff1aSopenharmony_ci src4 = __lsx_vld(src0_ptr, 0); 655cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 656cabdff1aSopenharmony_ci src5, src6); 657cabdff1aSopenharmony_ci src0_ptr += src_stride_3x; 658cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, 659cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 660cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r); 661cabdff1aSopenharmony_ci 662cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 663cabdff1aSopenharmony_ci src7 = __lsx_vld(src0_ptr, 0); 664cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 665cabdff1aSopenharmony_ci src8, src9); 666cabdff1aSopenharmony_ci src10 = __lsx_vldx(src0_ptr, src_stride_3x); 667cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 668cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr, 0); 669cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x, 670cabdff1aSopenharmony_ci in1, in2); 671cabdff1aSopenharmony_ci in3 = __lsx_vldx(src1_ptr, src2_stride_3x); 672cabdff1aSopenharmony_ci src1_ptr += src2_stride_2x; 
673cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, 674cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 675cabdff1aSopenharmony_ci 676cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r, 677cabdff1aSopenharmony_ci filt0, src43_r, filt0, dst0_r, dst1_r, dst2_r, dst3_r); 678cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r, 679cabdff1aSopenharmony_ci filt1, dst2_r, src54_r, filt1, dst3_r, src65_r, filt1, 680cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 681cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r, src65_r, 682cabdff1aSopenharmony_ci filt2, dst2_r, src76_r, filt2, dst3_r, src87_r, filt2, 683cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 684cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r, src87_r, 685cabdff1aSopenharmony_ci filt3, dst2_r, src98_r, filt3, dst3_r, src109_r, filt3, 686cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 687cabdff1aSopenharmony_ci 688cabdff1aSopenharmony_ci dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r); 689cabdff1aSopenharmony_ci dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r); 690cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0_r, dst, 0, 0); 691cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1); 692cabdff1aSopenharmony_ci __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0); 693cabdff1aSopenharmony_ci __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1); 694cabdff1aSopenharmony_ci dst += dst_stride_4x; 695cabdff1aSopenharmony_ci 696cabdff1aSopenharmony_ci src10_r = src54_r; 697cabdff1aSopenharmony_ci src32_r = src76_r; 698cabdff1aSopenharmony_ci src54_r = src98_r; 699cabdff1aSopenharmony_ci src21_r = src65_r; 700cabdff1aSopenharmony_ci src43_r = src87_r; 701cabdff1aSopenharmony_ci src65_r = src109_r; 702cabdff1aSopenharmony_ci 703cabdff1aSopenharmony_ci src6 = 
src10; 704cabdff1aSopenharmony_ci } 705cabdff1aSopenharmony_ci} 706cabdff1aSopenharmony_ci 707cabdff1aSopenharmony_cistatic av_always_inline 708cabdff1aSopenharmony_civoid hevc_vt_8t_16multx2mult_lsx(uint8_t *src0_ptr, int32_t src_stride, 709cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 710cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 711cabdff1aSopenharmony_ci const int8_t *filter, int32_t height, 712cabdff1aSopenharmony_ci int32_t width) 713cabdff1aSopenharmony_ci{ 714cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp; 715cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 716cabdff1aSopenharmony_ci uint8_t *dst_tmp; 717cabdff1aSopenharmony_ci uint32_t loop_cnt; 718cabdff1aSopenharmony_ci uint32_t cnt; 719cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 720cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 721cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 722cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 723cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; 724cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3; 725cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src54_r, src76_r; 726cabdff1aSopenharmony_ci __m128i src21_r, src43_r, src65_r, src87_r; 727cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r; 728cabdff1aSopenharmony_ci __m128i src10_l, src32_l, src54_l, src76_l; 729cabdff1aSopenharmony_ci __m128i src21_l, src43_l, src65_l, src87_l; 730cabdff1aSopenharmony_ci __m128i dst0_l, dst1_l; 731cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 732cabdff1aSopenharmony_ci 733cabdff1aSopenharmony_ci src0_ptr -= src_stride_3x; 734cabdff1aSopenharmony_ci 735cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 736cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 737cabdff1aSopenharmony_ci 738cabdff1aSopenharmony_ci for (cnt = (width >> 4); cnt--;) { 739cabdff1aSopenharmony_ci 
src0_ptr_tmp = src0_ptr; 740cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 741cabdff1aSopenharmony_ci dst_tmp = dst; 742cabdff1aSopenharmony_ci 743cabdff1aSopenharmony_ci src0 = __lsx_vld(src0_ptr_tmp, 0); 744cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp, 745cabdff1aSopenharmony_ci src_stride_2x, src1, src2); 746cabdff1aSopenharmony_ci src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x); 747cabdff1aSopenharmony_ci src0_ptr_tmp += src_stride_4x; 748cabdff1aSopenharmony_ci src4 = __lsx_vld(src0_ptr_tmp, 0); 749cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp, 750cabdff1aSopenharmony_ci src_stride_2x, src5, src6); 751cabdff1aSopenharmony_ci src0_ptr_tmp += src_stride_3x; 752cabdff1aSopenharmony_ci 753cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, 754cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 755cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r); 756cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, 757cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 758cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l); 759cabdff1aSopenharmony_ci 760cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 761cabdff1aSopenharmony_ci src7 = __lsx_vld(src0_ptr_tmp, 0); 762cabdff1aSopenharmony_ci src8 = __lsx_vldx(src0_ptr_tmp, src_stride); 763cabdff1aSopenharmony_ci src0_ptr_tmp += src_stride_2x; 764cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in0, in2); 765cabdff1aSopenharmony_ci src1_ptr_tmp += src2_stride; 766cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in1, in3); 767cabdff1aSopenharmony_ci src1_ptr_tmp += src2_stride; 768cabdff1aSopenharmony_ci 769cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, 
src87_r); 770cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l); 771cabdff1aSopenharmony_ci 772cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l, 773cabdff1aSopenharmony_ci filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l); 774cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, 775cabdff1aSopenharmony_ci src43_r, filt1, dst0_l, src32_l, filt1, dst1_l, src43_l, 776cabdff1aSopenharmony_ci filt1, dst0_r, dst1_r, dst0_l, dst1_l); 777cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r, 778cabdff1aSopenharmony_ci src65_r, filt2, dst0_l, src54_l, filt2, dst1_l, src65_l, 779cabdff1aSopenharmony_ci filt2, dst0_r, dst1_r, dst0_l, dst1_l); 780cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r, 781cabdff1aSopenharmony_ci src87_r, filt3, dst0_l, src76_l, filt3, dst1_l, src87_l, 782cabdff1aSopenharmony_ci filt3, dst0_r, dst1_r, dst0_l, dst1_l); 783cabdff1aSopenharmony_ci dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l); 784cabdff1aSopenharmony_ci dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l); 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst_tmp, 0); 787cabdff1aSopenharmony_ci __lsx_vstx(dst1_r, dst_tmp, dst_stride); 788cabdff1aSopenharmony_ci dst_tmp += dst_stride_2x; 789cabdff1aSopenharmony_ci 790cabdff1aSopenharmony_ci src10_r = src32_r; 791cabdff1aSopenharmony_ci src32_r = src54_r; 792cabdff1aSopenharmony_ci src54_r = src76_r; 793cabdff1aSopenharmony_ci src21_r = src43_r; 794cabdff1aSopenharmony_ci src43_r = src65_r; 795cabdff1aSopenharmony_ci src65_r = src87_r; 796cabdff1aSopenharmony_ci src10_l = src32_l; 797cabdff1aSopenharmony_ci src32_l = src54_l; 798cabdff1aSopenharmony_ci src54_l = src76_l; 799cabdff1aSopenharmony_ci src21_l = src43_l; 800cabdff1aSopenharmony_ci src43_l = src65_l; 801cabdff1aSopenharmony_ci src65_l = src87_l; 
802cabdff1aSopenharmony_ci src6 = src8; 803cabdff1aSopenharmony_ci } 804cabdff1aSopenharmony_ci 805cabdff1aSopenharmony_ci src0_ptr += 16; 806cabdff1aSopenharmony_ci src1_ptr += 16; 807cabdff1aSopenharmony_ci dst += 16; 808cabdff1aSopenharmony_ci } 809cabdff1aSopenharmony_ci} 810cabdff1aSopenharmony_ci 811cabdff1aSopenharmony_cistatic void hevc_vt_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride, 812cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 813cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 814cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 815cabdff1aSopenharmony_ci{ 816cabdff1aSopenharmony_ci hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 817cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 16); 818cabdff1aSopenharmony_ci} 819cabdff1aSopenharmony_ci 820cabdff1aSopenharmony_cistatic void hevc_vt_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride, 821cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 822cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 823cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 824cabdff1aSopenharmony_ci{ 825cabdff1aSopenharmony_ci hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 826cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 16); 827cabdff1aSopenharmony_ci hevc_vt_8t_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, 828cabdff1aSopenharmony_ci dst + 16, dst_stride, filter, height); 829cabdff1aSopenharmony_ci} 830cabdff1aSopenharmony_ci 831cabdff1aSopenharmony_cistatic void hevc_vt_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride, 832cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 833cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 834cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 835cabdff1aSopenharmony_ci{ 836cabdff1aSopenharmony_ci hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 837cabdff1aSopenharmony_ci dst, 
dst_stride, filter, height, 32); 838cabdff1aSopenharmony_ci} 839cabdff1aSopenharmony_ci 840cabdff1aSopenharmony_cistatic void hevc_vt_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride, 841cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 842cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 843cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 844cabdff1aSopenharmony_ci{ 845cabdff1aSopenharmony_ci hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 846cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 48); 847cabdff1aSopenharmony_ci} 848cabdff1aSopenharmony_ci 849cabdff1aSopenharmony_cistatic void hevc_vt_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride, 850cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 851cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 852cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 853cabdff1aSopenharmony_ci{ 854cabdff1aSopenharmony_ci hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 855cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 64); 856cabdff1aSopenharmony_ci} 857cabdff1aSopenharmony_ci 858cabdff1aSopenharmony_cistatic av_always_inline 859cabdff1aSopenharmony_civoid hevc_hv_8t_8multx1mult_lsx(uint8_t *src0_ptr, int32_t src_stride, 860cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 861cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 862cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 863cabdff1aSopenharmony_ci int32_t height, int32_t width) 864cabdff1aSopenharmony_ci{ 865cabdff1aSopenharmony_ci uint32_t loop_cnt; 866cabdff1aSopenharmony_ci uint32_t cnt; 867cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp; 868cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 869cabdff1aSopenharmony_ci uint8_t *dst_tmp; 870cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 871cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 872cabdff1aSopenharmony_ci 
int32_t src_stride_3x = src_stride_2x + src_stride; 873cabdff1aSopenharmony_ci __m128i out; 874cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 875cabdff1aSopenharmony_ci __m128i in0, tmp; 876cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 877cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1, filt_h2, filt_h3; 878cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 879cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 880cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 881cabdff1aSopenharmony_ci __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 882cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 883cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l; 884cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst54_r, dst76_r; 885cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst54_l, dst76_l; 886cabdff1aSopenharmony_ci 887cabdff1aSopenharmony_ci src0_ptr -= src_stride_3x + 3; 888cabdff1aSopenharmony_ci 889cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, filter_x, 890cabdff1aSopenharmony_ci 6, filt0, filt1, filt2, filt3); 891cabdff1aSopenharmony_ci filt_h3 = __lsx_vld(filter_y, 0); 892cabdff1aSopenharmony_ci filt_h3 = __lsx_vsllwil_h_b(filt_h3, 0); 893cabdff1aSopenharmony_ci 894cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vreplvei_w, filt_h3, 0, filt_h3, 1, filt_h3, 2, filt_h3, 3, 895cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 896cabdff1aSopenharmony_ci 897cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 898cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 899cabdff1aSopenharmony_ci 900cabdff1aSopenharmony_ci for (cnt = width >> 3; cnt--;) { 901cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 902cabdff1aSopenharmony_ci dst_tmp = dst; 903cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 904cabdff1aSopenharmony_ci 905cabdff1aSopenharmony_ci src0 = 
__lsx_vld(src0_ptr_tmp, 0); 906cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp, 907cabdff1aSopenharmony_ci src_stride_2x, src1, src2); 908cabdff1aSopenharmony_ci src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x); 909cabdff1aSopenharmony_ci src0_ptr_tmp += src_stride_4x; 910cabdff1aSopenharmony_ci src4 = __lsx_vld(src0_ptr_tmp, 0); 911cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp, 912cabdff1aSopenharmony_ci src_stride_2x, src5, src6); 913cabdff1aSopenharmony_ci src0_ptr_tmp += src_stride_3x; 914cabdff1aSopenharmony_ci 915cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 916cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, 917cabdff1aSopenharmony_ci src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3); 918cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, 919cabdff1aSopenharmony_ci src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7); 920cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, 921cabdff1aSopenharmony_ci src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11); 922cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, 923cabdff1aSopenharmony_ci src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15); 924cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0, 925cabdff1aSopenharmony_ci vec12, filt0, dst0, dst1, dst2, dst3); 926cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1, 927cabdff1aSopenharmony_ci dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3); 928cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2, 929cabdff1aSopenharmony_ci dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3); 930cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, 
dst1, vec7, filt3, 931cabdff1aSopenharmony_ci dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3); 932cabdff1aSopenharmony_ci 933cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4, 934cabdff1aSopenharmony_ci src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3); 935cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5, 936cabdff1aSopenharmony_ci src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7); 937cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6, 938cabdff1aSopenharmony_ci src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11); 939cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5); 940cabdff1aSopenharmony_ci dst6 = __lsx_vdp2_h_bu_b(vec8, filt0); 941cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1, 942cabdff1aSopenharmony_ci dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4); 943cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2, 944cabdff1aSopenharmony_ci dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5); 945cabdff1aSopenharmony_ci dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3); 946cabdff1aSopenharmony_ci 947cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 948cabdff1aSopenharmony_ci src7 = __lsx_vld(src0_ptr_tmp, 0); 949cabdff1aSopenharmony_ci src0_ptr_tmp += src_stride; 950cabdff1aSopenharmony_ci 951cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr_tmp, 0); 952cabdff1aSopenharmony_ci src1_ptr_tmp += src2_stride; 953cabdff1aSopenharmony_ci 954cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7, 955cabdff1aSopenharmony_ci src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3); 956cabdff1aSopenharmony_ci dst7 = __lsx_vdp2_h_bu_b(vec0, filt0); 957cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, 
dst7, vec2, 958cabdff1aSopenharmony_ci filt2, dst7, dst7); 959cabdff1aSopenharmony_ci dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3); 960cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, 961cabdff1aSopenharmony_ci dst6, dst10_r, dst32_r, dst54_r, dst76_r); 962cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, 963cabdff1aSopenharmony_ci dst6, dst10_l, dst32_l, dst54_l, dst76_l); 964cabdff1aSopenharmony_ci 965cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, 966cabdff1aSopenharmony_ci dst0_r, dst0_l); 967cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, 968cabdff1aSopenharmony_ci dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l, 969cabdff1aSopenharmony_ci dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l); 970cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l, 971cabdff1aSopenharmony_ci dst76_l, filt_h3, dst0_r, dst0_l); 972cabdff1aSopenharmony_ci dst0_r = __lsx_vsrli_w(dst0_r, 6); 973cabdff1aSopenharmony_ci dst0_l = __lsx_vsrli_w(dst0_l, 6); 974cabdff1aSopenharmony_ci 975cabdff1aSopenharmony_ci tmp = __lsx_vpickev_h(dst0_l, dst0_r); 976cabdff1aSopenharmony_ci tmp = __lsx_vsadd_h(tmp, in0); 977cabdff1aSopenharmony_ci tmp = __lsx_vmaxi_h(tmp, 0); 978cabdff1aSopenharmony_ci out = __lsx_vssrlrni_bu_h(tmp, tmp, 7); 979cabdff1aSopenharmony_ci __lsx_vstelm_d(out, dst_tmp, 0, 0); 980cabdff1aSopenharmony_ci dst_tmp += dst_stride; 981cabdff1aSopenharmony_ci 982cabdff1aSopenharmony_ci dst0 = dst1; 983cabdff1aSopenharmony_ci dst1 = dst2; 984cabdff1aSopenharmony_ci dst2 = dst3; 985cabdff1aSopenharmony_ci dst3 = dst4; 986cabdff1aSopenharmony_ci dst4 = dst5; 987cabdff1aSopenharmony_ci dst5 = dst6; 988cabdff1aSopenharmony_ci dst6 = dst7; 989cabdff1aSopenharmony_ci } 990cabdff1aSopenharmony_ci 991cabdff1aSopenharmony_ci src0_ptr += 8; 992cabdff1aSopenharmony_ci dst += 8; 
993cabdff1aSopenharmony_ci src1_ptr += 8; 994cabdff1aSopenharmony_ci } 995cabdff1aSopenharmony_ci} 996cabdff1aSopenharmony_ci 997cabdff1aSopenharmony_cistatic void hevc_hv_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride, 998cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 999cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1000cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1001cabdff1aSopenharmony_ci int32_t height) 1002cabdff1aSopenharmony_ci{ 1003cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 1004cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, height, 8); 1005cabdff1aSopenharmony_ci} 1006cabdff1aSopenharmony_ci 1007cabdff1aSopenharmony_cistatic void hevc_hv_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1008cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1009cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1010cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1011cabdff1aSopenharmony_ci int32_t height) 1012cabdff1aSopenharmony_ci{ 1013cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 1014cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, height, 16); 1015cabdff1aSopenharmony_ci} 1016cabdff1aSopenharmony_ci 1017cabdff1aSopenharmony_cistatic void hevc_hv_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1018cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1019cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1020cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1021cabdff1aSopenharmony_ci int32_t height) 1022cabdff1aSopenharmony_ci{ 1023cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 1024cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, height, 24); 1025cabdff1aSopenharmony_ci} 1026cabdff1aSopenharmony_ci 1027cabdff1aSopenharmony_cistatic void 
hevc_hv_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1028cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1029cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1030cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1031cabdff1aSopenharmony_ci int32_t height) 1032cabdff1aSopenharmony_ci{ 1033cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 1034cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, height, 32); 1035cabdff1aSopenharmony_ci} 1036cabdff1aSopenharmony_ci 1037cabdff1aSopenharmony_cistatic void hevc_hv_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1038cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1039cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1040cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1041cabdff1aSopenharmony_ci int32_t height) 1042cabdff1aSopenharmony_ci{ 1043cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 1044cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, height, 48); 1045cabdff1aSopenharmony_ci} 1046cabdff1aSopenharmony_ci 1047cabdff1aSopenharmony_cistatic void hevc_hv_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1048cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1049cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1050cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1051cabdff1aSopenharmony_ci int32_t height) 1052cabdff1aSopenharmony_ci{ 1053cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 1054cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, height, 64); 1055cabdff1aSopenharmony_ci} 1056cabdff1aSopenharmony_ci 1057cabdff1aSopenharmony_cistatic void hevc_hz_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1058cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1059cabdff1aSopenharmony_ci uint8_t 
*dst, int32_t dst_stride, 1060cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1061cabdff1aSopenharmony_ci{ 1062cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 1063cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1064cabdff1aSopenharmony_ci uint32_t loop_cnt; 1065cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 1066cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 1067cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 1068cabdff1aSopenharmony_ci int32_t src2_stride_x = src2_stride << 1; 1069cabdff1aSopenharmony_ci int32_t src2_stride_2x = src2_stride << 2; 1070cabdff1aSopenharmony_ci int32_t src2_stride_3x = src2_stride_2x + src2_stride_x; 1071cabdff1aSopenharmony_ci 1072cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 1073cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5, in6, in7; 1074cabdff1aSopenharmony_ci __m128i filt0, filt1; 1075cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 1076cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 1077cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3; 1078cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1079cabdff1aSopenharmony_ci 1080cabdff1aSopenharmony_ci src0_ptr -= 1; 1081cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 1082cabdff1aSopenharmony_ci 1083cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2); 1084cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 10); 1085cabdff1aSopenharmony_ci 1086cabdff1aSopenharmony_ci dst_tmp = dst + 16; 1087cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr + 16; 1088cabdff1aSopenharmony_ci 1089cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1090cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1); 1091cabdff1aSopenharmony_ci src0_ptr += src_stride; 1092cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, 
src0_ptr, 0, src0_ptr, 16, src2, src3); 1093cabdff1aSopenharmony_ci src0_ptr += src_stride; 1094cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src5); 1095cabdff1aSopenharmony_ci src0_ptr += src_stride; 1096cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src6, src7); 1097cabdff1aSopenharmony_ci src0_ptr += src_stride; 1098cabdff1aSopenharmony_ci 1099cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1); 1100cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1101cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3); 1102cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1103cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in4, in5); 1104cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1105cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in6, in7); 1106cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1107cabdff1aSopenharmony_ci 1108cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src2, 1109cabdff1aSopenharmony_ci src2, mask0, src3, src2, mask2, vec0, vec1, vec2, vec3); 1110cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 1111cabdff1aSopenharmony_ci vec3, filt0, dst0, dst1, dst2, dst3); 1112cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src2, 1113cabdff1aSopenharmony_ci src2, mask1, src3, src2, mask3, vec0, vec1, vec2, vec3); 1114cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1, 1115cabdff1aSopenharmony_ci dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3); 1116cabdff1aSopenharmony_ci 1117cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src4, mask2, src6, 1118cabdff1aSopenharmony_ci src6, mask0, src7, src6, mask2, vec0, vec1, vec2, vec3); 1119cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, 
vec2, filt0, 1120cabdff1aSopenharmony_ci vec3, filt0, dst4, dst5, dst6, dst7); 1121cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src4, mask3, src6, 1122cabdff1aSopenharmony_ci src6, mask1, src7, src6, mask3, vec0, vec1, vec2, vec3); 1123cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec0, filt1, dst5, vec1, filt1, 1124cabdff1aSopenharmony_ci dst6, vec2, filt1, dst7, vec3, filt1, dst4, dst5, dst6, dst7); 1125cabdff1aSopenharmony_ci 1126cabdff1aSopenharmony_ci dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 1127cabdff1aSopenharmony_ci dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3); 1128cabdff1aSopenharmony_ci dst2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5); 1129cabdff1aSopenharmony_ci dst3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7); 1130cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 1131cabdff1aSopenharmony_ci __lsx_vstx(dst1, dst, dst_stride); 1132cabdff1aSopenharmony_ci __lsx_vstx(dst2, dst, dst_stride_2x); 1133cabdff1aSopenharmony_ci __lsx_vstx(dst3, dst, dst_stride_3x); 1134cabdff1aSopenharmony_ci dst += dst_stride_4x; 1135cabdff1aSopenharmony_ci 1136cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr_tmp, 0); 1137cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp, 1138cabdff1aSopenharmony_ci src2_stride_2x, in1, in2); 1139cabdff1aSopenharmony_ci in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x); 1140cabdff1aSopenharmony_ci src1_ptr_tmp += src2_stride_2x; 1141cabdff1aSopenharmony_ci 1142cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src3, src3, mask0, src5, 1143cabdff1aSopenharmony_ci src5, mask0, src7, src7, mask0, vec0, vec1, vec2, vec3); 1144cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 1145cabdff1aSopenharmony_ci vec3, filt0, dst0, dst1, dst2, dst3); 1146cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask1, src3, src3, mask1, src5, 1147cabdff1aSopenharmony_ci src5, mask1, src7, src7, mask1, vec0, vec1, vec2, 
vec3); 1148cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1, 1149cabdff1aSopenharmony_ci dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3); 1150cabdff1aSopenharmony_ci dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 1151cabdff1aSopenharmony_ci dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3); 1152cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst_tmp, 0, 0); 1153cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst_tmp + dst_stride, 0, 1); 1154cabdff1aSopenharmony_ci __lsx_vstelm_d(dst1, dst_tmp + dst_stride_2x, 0, 0); 1155cabdff1aSopenharmony_ci __lsx_vstelm_d(dst1, dst_tmp + dst_stride_3x, 0, 1); 1156cabdff1aSopenharmony_ci dst_tmp += dst_stride_4x; 1157cabdff1aSopenharmony_ci } 1158cabdff1aSopenharmony_ci} 1159cabdff1aSopenharmony_ci 1160cabdff1aSopenharmony_cistatic void hevc_hz_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1161cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1162cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1163cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1164cabdff1aSopenharmony_ci{ 1165cabdff1aSopenharmony_ci uint32_t loop_cnt; 1166cabdff1aSopenharmony_ci __m128i src0, src1, src2; 1167cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3; 1168cabdff1aSopenharmony_ci __m128i filt0, filt1; 1169cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 1170cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 1171cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3; 1172cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3; 1173cabdff1aSopenharmony_ci 1174cabdff1aSopenharmony_ci src0_ptr -= 1; 1175cabdff1aSopenharmony_ci 1176cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 1177cabdff1aSopenharmony_ci 1178cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2); 1179cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 10); 1180cabdff1aSopenharmony_ci 
1181cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1182cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1); 1183cabdff1aSopenharmony_ci src2 = __lsx_vld(src0_ptr, 24); 1184cabdff1aSopenharmony_ci src0_ptr += src_stride; 1185cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, src1_ptr, 32, 1186cabdff1aSopenharmony_ci src1_ptr, 48, in0, in1, in2, in3); 1187cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1188cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src1, 1189cabdff1aSopenharmony_ci src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3); 1190cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 1191cabdff1aSopenharmony_ci vec3, filt0, dst0, dst1, dst2, dst3); 1192cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src1, 1193cabdff1aSopenharmony_ci src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3); 1194cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1, 1195cabdff1aSopenharmony_ci dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3); 1196cabdff1aSopenharmony_ci dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1); 1197cabdff1aSopenharmony_ci dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3); 1198cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 1199cabdff1aSopenharmony_ci __lsx_vst(dst1, dst, 16); 1200cabdff1aSopenharmony_ci dst += dst_stride; 1201cabdff1aSopenharmony_ci } 1202cabdff1aSopenharmony_ci} 1203cabdff1aSopenharmony_ci 1204cabdff1aSopenharmony_cistatic void hevc_vt_4t_12w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1205cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1206cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1207cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1208cabdff1aSopenharmony_ci{ 1209cabdff1aSopenharmony_ci int32_t loop_cnt; 1210cabdff1aSopenharmony_ci int32_t src_stride_2x = 
(src_stride << 1); 1211cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 1212cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 1213cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 1214cabdff1aSopenharmony_ci int32_t src2_stride_x = (src2_stride << 1); 1215cabdff1aSopenharmony_ci int32_t src2_stride_2x = (src2_stride << 2); 1216cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 1217cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 1218cabdff1aSopenharmony_ci int32_t src2_stride_3x = src2_stride_2x + src2_stride_x; 1219cabdff1aSopenharmony_ci int16_t *_src1 = src1_ptr + 8; 1220cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6; 1221cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5, in6, in7; 1222cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src21_r, src43_r, src54_r, src65_r; 1223cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst2_r, dst3_r; 1224cabdff1aSopenharmony_ci __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; 1225cabdff1aSopenharmony_ci __m128i src2110, src4332, src6554; 1226cabdff1aSopenharmony_ci __m128i dst0_l, dst1_l, filt0, filt1; 1227cabdff1aSopenharmony_ci 1228cabdff1aSopenharmony_ci src0_ptr -= src_stride; 1229cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 1230cabdff1aSopenharmony_ci 1231cabdff1aSopenharmony_ci src0 = __lsx_vld(src0_ptr, 0); 1232cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 1233cabdff1aSopenharmony_ci src1, src2); 1234cabdff1aSopenharmony_ci src0_ptr += src_stride_3x; 1235cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 1236cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 1237cabdff1aSopenharmony_ci src2110 = __lsx_vilvl_d(src21_l, src10_l); 1238cabdff1aSopenharmony_ci 1239cabdff1aSopenharmony_ci for (loop_cnt = 
(height >> 2); loop_cnt--;) { 1240cabdff1aSopenharmony_ci src3 = __lsx_vld(src0_ptr, 0); 1241cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 1242cabdff1aSopenharmony_ci src4, src5); 1243cabdff1aSopenharmony_ci src6 = __lsx_vldx(src0_ptr, src_stride_3x); 1244cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 1245cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr, 0); 1246cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, 1247cabdff1aSopenharmony_ci src2_stride_2x, in1, in2); 1248cabdff1aSopenharmony_ci in3 = __lsx_vldx(src1_ptr, src2_stride_3x); 1249cabdff1aSopenharmony_ci src1_ptr += src2_stride_2x; 1250cabdff1aSopenharmony_ci in4 = __lsx_vld(_src1, 0); 1251cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x, 1252cabdff1aSopenharmony_ci in5, in6); 1253cabdff1aSopenharmony_ci in7 = __lsx_vldx(_src1, src2_stride_3x); 1254cabdff1aSopenharmony_ci _src1 += src2_stride_2x; 1255cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5); 1256cabdff1aSopenharmony_ci 1257cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 1258cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 1259cabdff1aSopenharmony_ci src4332 = __lsx_vilvl_d(src43_l, src32_l); 1260cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src54_r, src65_r); 1261cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src54_l, src65_l); 1262cabdff1aSopenharmony_ci src6554 = __lsx_vilvl_d(src65_l, src54_l); 1263cabdff1aSopenharmony_ci 1264cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src2110, 1265cabdff1aSopenharmony_ci filt0, src32_r, filt0, dst0_r, dst1_r, dst0_l, dst2_r); 1266cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, src43_r, filt0, src4332, filt0, 1267cabdff1aSopenharmony_ci dst3_r, dst1_l); 
1268cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, 1269cabdff1aSopenharmony_ci src43_r, filt1, dst0_l, src4332, filt1, dst2_r, src54_r, 1270cabdff1aSopenharmony_ci filt1, dst0_r, dst1_r, dst0_l, dst2_r); 1271cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst1_l, 1272cabdff1aSopenharmony_ci src6554, filt1, dst3_r, dst1_l); 1273cabdff1aSopenharmony_ci dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r); 1274cabdff1aSopenharmony_ci dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r); 1275cabdff1aSopenharmony_ci dst0_l = hevc_bi_rnd_clip(in4, dst0_l, in5, dst1_l); 1276cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0_r, dst, 0, 0); 1277cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1); 1278cabdff1aSopenharmony_ci __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0); 1279cabdff1aSopenharmony_ci __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1); 1280cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0_l, dst, 8, 0); 1281cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0_l, dst + dst_stride, 8, 1); 1282cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0_l, dst + dst_stride_2x, 8, 2); 1283cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0_l, dst + dst_stride_3x, 8, 3); 1284cabdff1aSopenharmony_ci dst += dst_stride_4x; 1285cabdff1aSopenharmony_ci 1286cabdff1aSopenharmony_ci src2 = src6; 1287cabdff1aSopenharmony_ci src10_r = src54_r; 1288cabdff1aSopenharmony_ci src21_r = src65_r; 1289cabdff1aSopenharmony_ci src2110 = src6554; 1290cabdff1aSopenharmony_ci } 1291cabdff1aSopenharmony_ci} 1292cabdff1aSopenharmony_ci 1293cabdff1aSopenharmony_cistatic void hevc_vt_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1294cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1295cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1296cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1297cabdff1aSopenharmony_ci{ 1298cabdff1aSopenharmony_ci int32_t loop_cnt; 1299cabdff1aSopenharmony_ci const int32_t 
src_stride_2x = (src_stride << 1); 1300cabdff1aSopenharmony_ci const int32_t dst_stride_2x = (dst_stride << 1); 1301cabdff1aSopenharmony_ci const int32_t src_stride_3x = src_stride_2x + src_stride; 1302cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5; 1303cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3; 1304cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src21_r, src43_r; 1305cabdff1aSopenharmony_ci __m128i src10_l, src32_l, src21_l, src43_l; 1306cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst0_l, dst1_l; 1307cabdff1aSopenharmony_ci __m128i filt0, filt1; 1308cabdff1aSopenharmony_ci 1309cabdff1aSopenharmony_ci src0_ptr -= src_stride; 1310cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 1311cabdff1aSopenharmony_ci 1312cabdff1aSopenharmony_ci src0 = __lsx_vld(src0_ptr, 0); 1313cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 1314cabdff1aSopenharmony_ci src1, src2); 1315cabdff1aSopenharmony_ci src0_ptr += src_stride_3x; 1316cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 1317cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 1318cabdff1aSopenharmony_ci 1319cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1320cabdff1aSopenharmony_ci src3 = __lsx_vld(src0_ptr, 0); 1321cabdff1aSopenharmony_ci src4 = __lsx_vldx(src0_ptr, src_stride); 1322cabdff1aSopenharmony_ci src0_ptr += src_stride_2x; 1323cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2); 1324cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1325cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3); 1326cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1327cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 1328cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 
1329cabdff1aSopenharmony_ci 1330cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l, 1331cabdff1aSopenharmony_ci filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l); 1332cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r, 1333cabdff1aSopenharmony_ci filt1, dst0_l, src32_l, filt1, dst1_l, src43_l, filt1, 1334cabdff1aSopenharmony_ci dst0_r, dst1_r, dst0_l, dst1_l); 1335cabdff1aSopenharmony_ci 1336cabdff1aSopenharmony_ci dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l); 1337cabdff1aSopenharmony_ci dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l); 1338cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 1339cabdff1aSopenharmony_ci __lsx_vstx(dst1_r, dst, dst_stride); 1340cabdff1aSopenharmony_ci dst += dst_stride_2x; 1341cabdff1aSopenharmony_ci 1342cabdff1aSopenharmony_ci src5 = __lsx_vld(src0_ptr, 0); 1343cabdff1aSopenharmony_ci src2 = __lsx_vldx(src0_ptr, src_stride); 1344cabdff1aSopenharmony_ci src0_ptr += src_stride_2x; 1345cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2); 1346cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1347cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3); 1348cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1349cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r); 1350cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l); 1351cabdff1aSopenharmony_ci 1352cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r, 1353cabdff1aSopenharmony_ci filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 1354cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, 1355cabdff1aSopenharmony_ci src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, 1356cabdff1aSopenharmony_ci filt1, dst0_r, dst0_l, dst1_r, dst1_l); 1357cabdff1aSopenharmony_ci dst0_r = 
hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l); 1358cabdff1aSopenharmony_ci dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l); 1359cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 1360cabdff1aSopenharmony_ci __lsx_vstx(dst1_r, dst, dst_stride); 1361cabdff1aSopenharmony_ci dst += dst_stride_2x; 1362cabdff1aSopenharmony_ci } 1363cabdff1aSopenharmony_ci} 1364cabdff1aSopenharmony_ci 1365cabdff1aSopenharmony_cistatic void hevc_vt_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1366cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1367cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1368cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1369cabdff1aSopenharmony_ci{ 1370cabdff1aSopenharmony_ci uint32_t loop_cnt; 1371cabdff1aSopenharmony_ci int32_t dst_stride_2x = dst_stride << 1; 1372cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5; 1373cabdff1aSopenharmony_ci __m128i src6, src7, src8, src9, src10, src11; 1374cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5; 1375cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src76_r, src98_r; 1376cabdff1aSopenharmony_ci __m128i src21_r, src43_r, src87_r, src109_r; 1377cabdff1aSopenharmony_ci __m128i src10_l, src32_l, src21_l, src43_l; 1378cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst2_r, dst3_r; 1379cabdff1aSopenharmony_ci __m128i dst0_l, dst1_l; 1380cabdff1aSopenharmony_ci __m128i filt0, filt1; 1381cabdff1aSopenharmony_ci 1382cabdff1aSopenharmony_ci src0_ptr -= src_stride; 1383cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 1384cabdff1aSopenharmony_ci 1385cabdff1aSopenharmony_ci /* 16width */ 1386cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src6); 1387cabdff1aSopenharmony_ci src0_ptr += src_stride; 1388cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src1, src7); 1389cabdff1aSopenharmony_ci src0_ptr += src_stride; 1390cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, 
src0_ptr, 0, src0_ptr, 16, src2, src8); 1391cabdff1aSopenharmony_ci src0_ptr += src_stride; 1392cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 1393cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 1394cabdff1aSopenharmony_ci /* 8width */ 1395cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r); 1396cabdff1aSopenharmony_ci 1397cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1398cabdff1aSopenharmony_ci /* 16width */ 1399cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src3, src9); 1400cabdff1aSopenharmony_ci src0_ptr += src_stride; 1401cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src10); 1402cabdff1aSopenharmony_ci src0_ptr += src_stride; 1403cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2); 1404cabdff1aSopenharmony_ci in4 = __lsx_vld(src1_ptr, 32); 1405cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1406cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3); 1407cabdff1aSopenharmony_ci in5 = __lsx_vld(src1_ptr, 32); 1408cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1409cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 1410cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 1411cabdff1aSopenharmony_ci /* 8width */ 1412cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r); 1413cabdff1aSopenharmony_ci /* 16width */ 1414cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r, 1415cabdff1aSopenharmony_ci filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 1416cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l, 1417cabdff1aSopenharmony_ci src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l, filt1, 
1418cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 1419cabdff1aSopenharmony_ci /* 8width */ 1420cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0, 1421cabdff1aSopenharmony_ci dst2_r, dst3_r); 1422cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r, 1423cabdff1aSopenharmony_ci src109_r, filt1, dst2_r, dst3_r); 1424cabdff1aSopenharmony_ci /* 16width */ 1425cabdff1aSopenharmony_ci dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l); 1426cabdff1aSopenharmony_ci dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l); 1427cabdff1aSopenharmony_ci dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r); 1428cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 1429cabdff1aSopenharmony_ci __lsx_vstx(dst1_r, dst, dst_stride); 1430cabdff1aSopenharmony_ci __lsx_vstelm_d(dst2_r, dst, 16, 0); 1431cabdff1aSopenharmony_ci __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1); 1432cabdff1aSopenharmony_ci dst += dst_stride_2x; 1433cabdff1aSopenharmony_ci 1434cabdff1aSopenharmony_ci /* 16width */ 1435cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, src0_ptr, 0, src1_ptr, 0, src1_ptr, 16, src1_ptr, 1436cabdff1aSopenharmony_ci 32, src5, in0, in2, in4); 1437cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1438cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, src0_ptr, 16, src1_ptr, 0, src1_ptr, 16, src1_ptr, 1439cabdff1aSopenharmony_ci 32, src11, in1, in3, in5); 1440cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1441cabdff1aSopenharmony_ci src0_ptr += src_stride; 1442cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8); 1443cabdff1aSopenharmony_ci src0_ptr += src_stride; 1444cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r); 1445cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l); 1446cabdff1aSopenharmony_ci /* 8width */ 1447cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r); 
1448cabdff1aSopenharmony_ci /* 16width */ 1449cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r, 1450cabdff1aSopenharmony_ci filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 1451cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, 1452cabdff1aSopenharmony_ci src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, 1453cabdff1aSopenharmony_ci filt1, dst0_r, dst0_l, dst1_r, dst1_l); 1454cabdff1aSopenharmony_ci 1455cabdff1aSopenharmony_ci /* 8width */ 1456cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0, 1457cabdff1aSopenharmony_ci dst2_r, dst3_r); 1458cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r, 1459cabdff1aSopenharmony_ci src87_r, filt1, dst2_r, dst3_r); 1460cabdff1aSopenharmony_ci 1461cabdff1aSopenharmony_ci dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l); 1462cabdff1aSopenharmony_ci dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l); 1463cabdff1aSopenharmony_ci dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r); 1464cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 1465cabdff1aSopenharmony_ci __lsx_vstx(dst1_r, dst, dst_stride); 1466cabdff1aSopenharmony_ci __lsx_vstelm_d(dst2_r, dst, 16, 0); 1467cabdff1aSopenharmony_ci __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1); 1468cabdff1aSopenharmony_ci dst += dst_stride_2x; 1469cabdff1aSopenharmony_ci } 1470cabdff1aSopenharmony_ci} 1471cabdff1aSopenharmony_ci 1472cabdff1aSopenharmony_cistatic void hevc_vt_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1473cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1474cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1475cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1476cabdff1aSopenharmony_ci{ 1477cabdff1aSopenharmony_ci hevc_vt_4t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 1478cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 1479cabdff1aSopenharmony_ci 
hevc_vt_4t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, 1480cabdff1aSopenharmony_ci dst + 16, dst_stride, filter, height); 1481cabdff1aSopenharmony_ci} 1482cabdff1aSopenharmony_ci 1483cabdff1aSopenharmony_cistatic void hevc_hv_4t_6w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1484cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1485cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1486cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1487cabdff1aSopenharmony_ci int32_t height) 1488cabdff1aSopenharmony_ci{ 1489cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 1490cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 1491cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 1492cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 1493cabdff1aSopenharmony_ci int32_t src2_stride_2x = (src2_stride << 1); 1494cabdff1aSopenharmony_ci int32_t src2_stride_4x = (src2_stride << 2); 1495cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 1496cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 1497cabdff1aSopenharmony_ci int32_t src2_stride_3x = src2_stride_2x + src2_stride; 1498cabdff1aSopenharmony_ci __m128i out0, out1; 1499cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6; 1500cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, mask1; 1501cabdff1aSopenharmony_ci __m128i filt0, filt1, filt_h0, filt_h1; 1502cabdff1aSopenharmony_ci __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5; 1503cabdff1aSopenharmony_ci __m128i dsth6, dsth7, dsth8, dsth9, dsth10; 1504cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 1505cabdff1aSopenharmony_ci __m128i dst4_r, dst5_r, dst6_r, dst7_r; 1506cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; 1507cabdff1aSopenharmony_ci __m128i reg0, reg1, reg2, reg3; 
1508cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 1509cabdff1aSopenharmony_ci 1510cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 1511cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 1512cabdff1aSopenharmony_ci 1513cabdff1aSopenharmony_ci filt_h1 = __lsx_vld(filter_y, 0); 1514cabdff1aSopenharmony_ci filt_h1 = __lsx_vsllwil_h_b(filt_h1, 0); 1515cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filt_h1, 0, filt_h1, 1, filt_h0, filt_h1); 1516cabdff1aSopenharmony_ci 1517cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 1518cabdff1aSopenharmony_ci 1519cabdff1aSopenharmony_ci src0 = __lsx_vld(src0_ptr, 0); 1520cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 1521cabdff1aSopenharmony_ci src1, src2); 1522cabdff1aSopenharmony_ci src0_ptr += src_stride_3x; 1523cabdff1aSopenharmony_ci 1524cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1); 1525cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3); 1526cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5); 1527cabdff1aSopenharmony_ci 1528cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1); 1529cabdff1aSopenharmony_ci dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0); 1530cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1, 1531cabdff1aSopenharmony_ci dsth0, dsth1); 1532cabdff1aSopenharmony_ci dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1); 1533cabdff1aSopenharmony_ci 1534cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, tmp0, tmp2); 1535cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, tmp1, tmp3); 1536cabdff1aSopenharmony_ci 1537cabdff1aSopenharmony_ci src3 = __lsx_vld(src0_ptr, 0); 1538cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, 
src0_ptr, src_stride, src0_ptr, src_stride_2x, 1539cabdff1aSopenharmony_ci src4, src5); 1540cabdff1aSopenharmony_ci src6 = __lsx_vldx(src0_ptr, src_stride_3x); 1541cabdff1aSopenharmony_ci src0_ptr += src_stride_4x; 1542cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1); 1543cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3); 1544cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5); 1545cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7); 1546cabdff1aSopenharmony_ci 1547cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6, 1548cabdff1aSopenharmony_ci filt0, dsth3, dsth4, dsth5, dsth6); 1549cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4, vec3, filt1, dsth5, 1550cabdff1aSopenharmony_ci vec5, filt1, dsth6, vec7, filt1, dsth3, dsth4, dsth5, dsth6); 1551cabdff1aSopenharmony_ci 1552cabdff1aSopenharmony_ci src3 = __lsx_vld(src0_ptr, 0); 1553cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 1554cabdff1aSopenharmony_ci src4, src5); 1555cabdff1aSopenharmony_ci src6 = __lsx_vldx(src0_ptr, src_stride_3x); 1556cabdff1aSopenharmony_ci 1557cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1); 1558cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3); 1559cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5); 1560cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7); 1561cabdff1aSopenharmony_ci 1562cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6, 1563cabdff1aSopenharmony_ci filt0, dsth7, dsth8, dsth9, dsth10); 1564cabdff1aSopenharmony_ci 
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth7, vec1, filt1, dsth8, vec3, filt1, dsth9, 1565cabdff1aSopenharmony_ci vec5, filt1, dsth10, vec7, filt1, dsth7, dsth8, dsth9, dsth10); 1566cabdff1aSopenharmony_ci 1567cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, tmp4, tmp6); 1568cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, tmp5, tmp7); 1569cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dsth5, dsth4, dsth6, dsth5, dsth0, dsth2); 1570cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dsth5, dsth4, dsth6, dsth5, dsth1, dsth3); 1571cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, tmp0, filt_h0, tmp2, filt_h0, tmp4, filt_h0, 1572cabdff1aSopenharmony_ci tmp6, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r); 1573cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, tmp4, filt_h1, dst1_r, tmp6, 1574cabdff1aSopenharmony_ci filt_h1, dst2_r, dsth0, filt_h1, dst3_r, dsth2, filt_h1, 1575cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 1576cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_d, tmp3, tmp1, tmp7, tmp5, tmp0, tmp8); 1577cabdff1aSopenharmony_ci dst0_l = __lsx_vdp2_w_h(tmp0, filt_h0); 1578cabdff1aSopenharmony_ci dst0_l = __lsx_vdp2add_w_h(dst0_l, tmp8, filt_h1); 1579cabdff1aSopenharmony_ci 1580cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dsth7, dsth6, dsth8, dsth7, tmp0, tmp2); 1581cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dsth7, dsth6, dsth8, dsth7, tmp1, tmp3); 1582cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dsth9, dsth8, dsth10, dsth9, tmp4, tmp6); 1583cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dsth9, dsth8, dsth10, dsth9, tmp5, tmp7); 1584cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dsth0, filt_h0, dsth2, filt_h0, tmp0, filt_h0, 1585cabdff1aSopenharmony_ci tmp2, filt_h0, dst4_r, dst5_r, dst6_r, dst7_r); 1586cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, tmp0, filt_h1, dst5_r, tmp2, 1587cabdff1aSopenharmony_ci filt_h1, dst6_r, tmp4, filt_h1, dst7_r, tmp6, filt_h1, 
1588cabdff1aSopenharmony_ci dst4_r, dst5_r, dst6_r, dst7_r); 1589cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_d, dsth3, dsth1, tmp3, tmp1, tmp0, tmp1); 1590cabdff1aSopenharmony_ci tmp2 = __lsx_vpickev_d(tmp7, tmp5); 1591cabdff1aSopenharmony_ci 1592cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_w_h, tmp8, filt_h0, tmp0, filt_h0, dst1_l, dst2_l); 1593cabdff1aSopenharmony_ci dst3_l = __lsx_vdp2_w_h(tmp1, filt_h0); 1594cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_w_h, dst1_l, tmp0, filt_h1, dst2_l, tmp1, filt_h1, 1595cabdff1aSopenharmony_ci dst1_l, dst2_l); 1596cabdff1aSopenharmony_ci dst3_l = __lsx_vdp2add_w_h(dst3_l, tmp2, filt_h1); 1597cabdff1aSopenharmony_ci 1598cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_d, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6, 1599cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 1600cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_d, dst4_r, 6, dst5_r, 6, dst6_r, 6, dst7_r, 6, 1601cabdff1aSopenharmony_ci dst4_r, dst5_r, dst6_r, dst7_r); 1602cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_d, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6, 1603cabdff1aSopenharmony_ci dst0_l, dst1_l, dst2_l, dst3_l); 1604cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1); 1605cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3); 1606cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5); 1607cabdff1aSopenharmony_ci 1608cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_d(src1_ptr, 0); 1609cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0); 1610cabdff1aSopenharmony_ci dsth0 = __lsx_vilvl_d(reg1, reg0); 1611cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0); 1612cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0); 1613cabdff1aSopenharmony_ci dsth1 = __lsx_vilvl_d(reg1, reg0); 1614cabdff1aSopenharmony_ci src1_ptr += src2_stride_4x; 1615cabdff1aSopenharmony_ci reg0 = 
__lsx_vldrepl_d(src1_ptr, 0); 1616cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0); 1617cabdff1aSopenharmony_ci dsth2 = __lsx_vilvl_d(reg1, reg0); 1618cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0); 1619cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0); 1620cabdff1aSopenharmony_ci dsth3 = __lsx_vilvl_d(reg1, reg0); 1621cabdff1aSopenharmony_ci 1622cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsadd_h, dsth0, tmp0, dsth1, tmp1, dsth2, tmp2, dsth3, 1623cabdff1aSopenharmony_ci tmp3, tmp0, tmp1, tmp2, tmp3); 1624cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0, 1625cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1626cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); 1627cabdff1aSopenharmony_ci 1628cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst, 0, 0); 1629cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst + dst_stride, 0, 1); 1630cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2); 1631cabdff1aSopenharmony_ci __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3); 1632cabdff1aSopenharmony_ci dst += dst_stride_4x; 1633cabdff1aSopenharmony_ci __lsx_vstelm_w(out1, dst, 0, 0); 1634cabdff1aSopenharmony_ci __lsx_vstelm_w(out1, dst + dst_stride, 0, 1); 1635cabdff1aSopenharmony_ci __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2); 1636cabdff1aSopenharmony_ci __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3); 1637cabdff1aSopenharmony_ci dst -= dst_stride_4x; 1638cabdff1aSopenharmony_ci 1639cabdff1aSopenharmony_ci src1_ptr -= src2_stride_4x; 1640cabdff1aSopenharmony_ci 1641cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_w(src1_ptr, 8); 1642cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8); 1643cabdff1aSopenharmony_ci reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8); 1644cabdff1aSopenharmony_ci reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8); 1645cabdff1aSopenharmony_ci 
DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1); 1646cabdff1aSopenharmony_ci dsth4 = __lsx_vilvl_d(tmp1, tmp0); 1647cabdff1aSopenharmony_ci src1_ptr += src2_stride_4x; 1648cabdff1aSopenharmony_ci 1649cabdff1aSopenharmony_ci reg0 = __lsx_vldrepl_w(src1_ptr, 8); 1650cabdff1aSopenharmony_ci reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8); 1651cabdff1aSopenharmony_ci reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8); 1652cabdff1aSopenharmony_ci reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8); 1653cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1); 1654cabdff1aSopenharmony_ci dsth5 = __lsx_vilvl_d(tmp1, tmp0); 1655cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsadd_h, dsth4, tmp4, dsth5, tmp5, tmp4, tmp5); 1656cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 7, tmp4, tmp5); 1657cabdff1aSopenharmony_ci out0 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7); 1658cabdff1aSopenharmony_ci 1659cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst, 4, 0); 1660cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst + dst_stride, 4, 1); 1661cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 2); 1662cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 3); 1663cabdff1aSopenharmony_ci dst += dst_stride_4x; 1664cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst, 4, 4); 1665cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst + dst_stride, 4, 5); 1666cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 6); 1667cabdff1aSopenharmony_ci __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 7); 1668cabdff1aSopenharmony_ci} 1669cabdff1aSopenharmony_ci

/*
 * Bi-prediction with a 4-tap horizontal and 4-tap vertical filter for one
 * 8-pixel-wide, 2-row block.
 *
 * Five source rows, starting one row above and one column to the left of
 * src0_ptr, are filtered horizontally with filter_x. Consecutive row pairs
 * are then filtered vertically with filter_y and shifted right by 6. The two
 * resulting rows are added (saturating) to two rows read from src1_ptr,
 * clamped at zero, rounded and shifted right by 7 with unsigned 8-bit
 * saturation, and written to dst.
 *
 * src_stride and dst_stride are applied to uint8_t pointers; src2_stride is
 * applied to the int16_t pointer src1_ptr, so it counts int16_t elements.
 */
static av_always_inline
void hevc_hv_4t_8x2_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
                        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
                        const int8_t *filter_x, const int8_t *filter_y)
{
    int32_t src_stride_2x = (src_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;

    __m128i out;
    __m128i src0, src1, src2, src3, src4;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1, filter_vec;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    __m128i dst0, dst1, dst2, dst3, dst4;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
    __m128i tmp0, tmp1;
    __m128i in0, in1;

    /* Step back one row and one column so the 4-tap window is in range. */
    src0_ptr -= (src_stride + 1);
    /* Horizontal taps, replicated as byte pairs (0,1) and (2,3). */
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    /* Vertical taps widened to 16 bit, then replicated as pairs (0,1), (2,3). */
    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    /* mask0 pairs adjacent bytes; mask1 does the same two bytes further on. */
    mask1 = __lsx_vaddi_bu(mask0, 2);

    src0 = __lsx_vld(src0_ptr, 0);
    DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
              src1, src2, src3, src4);

    DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr + src2_stride, 0, in0, in1);

    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);

    /* Horizontal pass: one 16-bit result row per source row. */
    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
              filt0, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);

    /* Interleave neighbouring rows so the vertical pass is a dot product. */
    DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
    DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
    DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);

    /* Combine with the second prediction, clamp at 0, round and narrow. */
    DUP2_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, tmp0, tmp1);
    DUP2_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp0, tmp1);
    out = __lsx_vssrlrni_bu_h(tmp1, tmp0, 7);
    __lsx_vstelm_d(out, dst, 0, 0);
    __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
}

/*
 * Bi-prediction with a 4-tap horizontal and 4-tap vertical filter for
 * width8mult adjacent 8-pixel-wide columns, each 4 rows high.
 *
 * Per column, seven source rows (starting one row above and one column to
 * the left of src0_ptr) are filtered horizontally, then vertically, shifted
 * right by 6, added (saturating) to four rows of src1_ptr, clamped at zero,
 * rounded and shifted right by 7 with unsigned 8-bit saturation, and stored
 * as four 8-byte rows at dst. All three pointers then advance by 8 elements.
 *
 * src2_stride counts int16_t elements; the src2_stride_* locals are the
 * matching byte offsets handed to __lsx_vldx (1, 2 and 3 rows).
 */
static av_always_inline
void hevc_hv_4t_8multx4_lsx(uint8_t *src0_ptr, int32_t src_stride,
                            int16_t *src1_ptr, int32_t src2_stride,
                            uint8_t *dst, int32_t dst_stride,
                            const int8_t *filter_x, const int8_t *filter_y,
                            int32_t width8mult)
{
    uint32_t cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;

    __m128i out0, out1;
    __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    __m128i in0, in1, in2, in3;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    /* Step back one row and one column so the 4-tap window is in range. */
    src0_ptr -= (src_stride + 1);
    /* Horizontal taps, replicated as byte pairs (0,1) and (2,3). */
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    /* Vertical taps widened to 16 bit, then replicated as pairs (0,1), (2,3). */
    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    /* mask0 pairs adjacent bytes; mask1 does the same two bytes further on. */
    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    mask1 = __lsx_vaddi_bu(mask0, 2);

    for (cnt = width8mult; cnt--;) {
        src0 = __lsx_vld(src0_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
                  src1, src2);
        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
        src0_ptr += src_stride_4x;
        src4 = __lsx_vld(src0_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
                  src5, src6);
        /* Back to the first row, 8 pixels to the right, for the next column. */
        src0_ptr += (8 - src_stride_4x);

        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += 8;

        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
                  vec2, vec3);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
                  vec4, vec5);

        /* Horizontal pass, rows 0..2. */
        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
                  dst0, dst1);
        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);

        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);

        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
                  vec2, vec3);
        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
                  vec4, vec5);
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
                  vec6, vec7);

        /* Horizontal pass, rows 3..6. */
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
                  vec6, filt0, dst3, dst4, dst5, dst6);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
                  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);

        DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
        DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);

        /* Vertical pass over the interleaved row pairs. */
        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
                  dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
                  dst2_r, dst2_l, dst3_r, dst3_l);

        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
                  dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
                  dst2_r, dst2_l, dst3_r, dst3_l);
        DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
                  dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);

        /* Combine with the second prediction, clamp at 0, round and narrow. */
        DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                  tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
                  tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
        dst += 8;
    }
}

/*
 * Bi-prediction with a 4-tap horizontal and 4-tap vertical filter for one
 * 8-pixel-wide, 6-row block.
 *
 * Nine source rows, starting one row above and one column to the left of
 * src0_ptr, are filtered horizontally, then vertically, shifted right by 6,
 * added (saturating) to six rows of src1_ptr, clamped at zero, rounded and
 * shifted right by 7 with unsigned 8-bit saturation, and stored as six
 * 8-byte rows at dst.
 *
 * src2_stride counts int16_t elements; the src2_stride_* locals are the
 * matching byte offsets handed to __lsx_vldx (1, 2 and 3 rows).
 */
static av_always_inline
void hevc_hv_4t_8x6_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
                        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
                        const int8_t *filter_x, const int8_t *filter_y)
{
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;

    __m128i out0, out1, out2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i in0, in1, in2, in3, in4, in5;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1, filter_vec;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
    __m128i dst76_r, dst76_l, dst87_r, dst87_l;

    /* Step back one row and one column so the 4-tap window is in range. */
    src0_ptr -= (src_stride + 1);
    /* Horizontal taps, replicated as byte pairs (0,1) and (2,3). */
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    /* Vertical taps widened to 16 bit, then replicated as pairs (0,1), (2,3). */
    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    /* mask0 pairs adjacent bytes; mask1 does the same two bytes further on. */
    mask1 = __lsx_vaddi_bu(mask0, 2);

    src0 = __lsx_vld(src0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src1, src2);
    src3 = __lsx_vldx(src0_ptr, src_stride_3x);
    src0_ptr += src_stride_4x;
    src4 = __lsx_vld(src0_ptr, 0);
    DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
              src5, src6, src7, src8);

    in0 = __lsx_vld(src1_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
              in1, in2);
    in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
    /*
     * Pointer arithmetic on int16_t *: this adds (src2_stride << 2) elements,
     * i.e. it moves four rows down, although the same value addresses row 2
     * when used as a __lsx_vldx byte offset above.
     */
    src1_ptr += src2_stride_2x;
    in4 = __lsx_vld(src1_ptr, 0);
    in5 = __lsx_vldx(src1_ptr, src2_stride_x);

    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec10, vec11);
    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec12, vec13);
    DUP2_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, vec14, vec15);
    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);

    /* Horizontal pass: one 16-bit result row per source row. */
    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
              filt0, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec10, filt0, vec12, filt0, vec14, filt0,
              vec16, filt0, dst5, dst6, dst7, dst8);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec11, filt1, dst6, vec13, filt1,
              dst7, vec15, filt1, dst8, vec17, filt1, dst5, dst6, dst7, dst8);

    /* Interleave neighbouring rows so the vertical pass is a dot product. */
    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_r, dst21_r, dst32_r, dst43_r);
    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_l, dst21_l, dst32_l, dst43_l);
    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
              dst54_r, dst65_r, dst76_r, dst87_r);
    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
              dst54_l, dst65_l, dst76_l, dst87_l);

    /* Vertical pass. */
    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
              dst2_r, dst2_l, dst3_r, dst3_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
              dst4_r, dst4_l, dst5_r, dst5_l);

    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
              dst2_r, dst2_l, dst3_r, dst3_l);
    DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6,
              dst4_r, dst4_l, dst5_r, dst5_l);
    DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
              dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);

    /* Combine with the second prediction, clamp at 0, round and narrow. */
    DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsadd_h, in4, tmp4, in5, tmp5, tmp4, tmp5);
    DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
    DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
    out2 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
    __lsx_vstelm_d(out0, dst, 0, 0);
    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
    __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
    __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
    dst += dst_stride_4x;
    __lsx_vstelm_d(out2, dst, 0, 0);
    __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
}

1992cabdff1aSopenharmony_cistatic av_always_inline 1993cabdff1aSopenharmony_civoid hevc_hv_4t_8multx4mult_lsx(uint8_t *src0_ptr, int32_t src_stride, 1994cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 1995cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1996cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1997cabdff1aSopenharmony_ci int32_t height, int32_t width) 1998cabdff1aSopenharmony_ci{ 1999cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 2000cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp; 2001cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 2002cabdff1aSopenharmony_ci uint8_t *dst_tmp; 2003cabdff1aSopenharmony_ci const int32_t src_stride_2x = (src_stride << 1); 2004cabdff1aSopenharmony_ci const int32_t dst_stride_2x = (dst_stride << 1); 2005cabdff1aSopenharmony_ci const int32_t src_stride_4x = (src_stride << 2); 2006cabdff1aSopenharmony_ci const int32_t dst_stride_4x = (dst_stride << 2); 2007cabdff1aSopenharmony_ci const int32_t src2_stride_x = (src2_stride << 1); 2008cabdff1aSopenharmony_ci const int32_t src2_stride_2x = (src2_stride << 2); 2009cabdff1aSopenharmony_ci const int32_t src_stride_3x = src_stride_2x + src_stride; 2010cabdff1aSopenharmony_ci const int32_t dst_stride_3x = dst_stride_2x + dst_stride; 2011cabdff1aSopenharmony_ci const int32_t src2_stride_3x = src2_stride_2x +
src2_stride_x; 2012cabdff1aSopenharmony_ci __m128i out0, out1; 2013cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6; 2014cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3; 2015cabdff1aSopenharmony_ci __m128i filt0, filt1; 2016cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1; 2017cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 2018cabdff1aSopenharmony_ci __m128i mask1, filter_vec; 2019cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2020cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5; 2021cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 2022cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3; 2023cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst21_r, dst43_r; 2024cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst21_l, dst43_l; 2025cabdff1aSopenharmony_ci __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6; 2026cabdff1aSopenharmony_ci 2027cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 2028cabdff1aSopenharmony_ci 2029cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 2030cabdff1aSopenharmony_ci 2031cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 2032cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 2033cabdff1aSopenharmony_ci 2034cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 2035cabdff1aSopenharmony_ci 2036cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 2037cabdff1aSopenharmony_ci 2038cabdff1aSopenharmony_ci for (cnt = width >> 3; cnt--;) { 2039cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 2040cabdff1aSopenharmony_ci dst_tmp = dst; 2041cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 2042cabdff1aSopenharmony_ci 2043cabdff1aSopenharmony_ci src0 = __lsx_vld(src0_ptr_tmp, 0); 2044cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp, 
2045cabdff1aSopenharmony_ci src_stride_2x, src1, src2); 2046cabdff1aSopenharmony_ci src0_ptr_tmp += src_stride_3x; 2047cabdff1aSopenharmony_ci 2048cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, 2049cabdff1aSopenharmony_ci vec0, vec1); 2050cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, 2051cabdff1aSopenharmony_ci vec2, vec3); 2052cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, 2053cabdff1aSopenharmony_ci vec4, vec5); 2054cabdff1aSopenharmony_ci 2055cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1); 2056cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec4, filt0); 2057cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, 2058cabdff1aSopenharmony_ci dst0, dst1); 2059cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1); 2060cabdff1aSopenharmony_ci 2061cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r); 2062cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l); 2063cabdff1aSopenharmony_ci 2064cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 2065cabdff1aSopenharmony_ci src3 = __lsx_vld(src0_ptr_tmp, 0); 2066cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp, 2067cabdff1aSopenharmony_ci src_stride_2x, src4, src5); 2068cabdff1aSopenharmony_ci src6 = __lsx_vldx(src0_ptr_tmp, src_stride_3x); 2069cabdff1aSopenharmony_ci src0_ptr_tmp += src_stride_4x; 2070cabdff1aSopenharmony_ci in0 = __lsx_vld(src1_ptr_tmp, 0); 2071cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp, 2072cabdff1aSopenharmony_ci src2_stride_2x, in1, in2); 2073cabdff1aSopenharmony_ci in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x); 2074cabdff1aSopenharmony_ci src1_ptr_tmp += src2_stride_2x; 2075cabdff1aSopenharmony_ci 
2076cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4, 2077cabdff1aSopenharmony_ci src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3); 2078cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6, 2079cabdff1aSopenharmony_ci src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7); 2080cabdff1aSopenharmony_ci 2081cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, 2082cabdff1aSopenharmony_ci vec6, filt0, dst3, dst4, dst5, dst6); 2083cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, 2084cabdff1aSopenharmony_ci filt1, dst5, vec5, filt1, dst6, vec7, filt1, 2085cabdff1aSopenharmony_ci dst3, dst4, dst5, dst6); 2086cabdff1aSopenharmony_ci 2087cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r); 2088cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l); 2089cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r); 2090cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l); 2091cabdff1aSopenharmony_ci 2092cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 2093cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 2094cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r, 2095cabdff1aSopenharmony_ci filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l); 2096cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, 2097cabdff1aSopenharmony_ci dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, 2098cabdff1aSopenharmony_ci dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l); 2099cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, 2100cabdff1aSopenharmony_ci dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, 
dst3_l, 2101cabdff1aSopenharmony_ci dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l); 2102cabdff1aSopenharmony_ci 2103cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, 2104cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 2105cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6, 2106cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 2107cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, 2108cabdff1aSopenharmony_ci dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3); 2109cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, 2110cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 2111cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0, tmp0, 2112cabdff1aSopenharmony_ci tmp1, tmp2, tmp3); 2113cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); 2114cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst_tmp, 0, 0); 2115cabdff1aSopenharmony_ci __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1); 2116cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0); 2117cabdff1aSopenharmony_ci __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1); 2118cabdff1aSopenharmony_ci dst_tmp += dst_stride_4x; 2119cabdff1aSopenharmony_ci 2120cabdff1aSopenharmony_ci dst10_r = dst54_r; 2121cabdff1aSopenharmony_ci dst10_l = dst54_l; 2122cabdff1aSopenharmony_ci dst21_r = dst65_r; 2123cabdff1aSopenharmony_ci dst21_l = dst65_l; 2124cabdff1aSopenharmony_ci dst2 = dst6; 2125cabdff1aSopenharmony_ci } 2126cabdff1aSopenharmony_ci 2127cabdff1aSopenharmony_ci src0_ptr += 8; 2128cabdff1aSopenharmony_ci dst += 8; 2129cabdff1aSopenharmony_ci src1_ptr += 8; 2130cabdff1aSopenharmony_ci } 2131cabdff1aSopenharmony_ci} 2132cabdff1aSopenharmony_ci 2133cabdff1aSopenharmony_cistatic void hevc_hv_4t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride, 
2134cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 2135cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2136cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 2137cabdff1aSopenharmony_ci int32_t height) 2138cabdff1aSopenharmony_ci{ 2139cabdff1aSopenharmony_ci if (2 == height) { 2140cabdff1aSopenharmony_ci hevc_hv_4t_8x2_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 2141cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y); 2142cabdff1aSopenharmony_ci } else if (4 == height) { 2143cabdff1aSopenharmony_ci hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 2144cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 1); 2145cabdff1aSopenharmony_ci } else if (6 == height) { 2146cabdff1aSopenharmony_ci hevc_hv_4t_8x6_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 2147cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y); 2148cabdff1aSopenharmony_ci } else { 2149cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 2150cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, height, 8); 2151cabdff1aSopenharmony_ci } 2152cabdff1aSopenharmony_ci} 2153cabdff1aSopenharmony_ci 2154cabdff1aSopenharmony_cistatic void hevc_hv_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride, 2155cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 2156cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2157cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 2158cabdff1aSopenharmony_ci int32_t height) 2159cabdff1aSopenharmony_ci{ 2160cabdff1aSopenharmony_ci if (4 == height) { 2161cabdff1aSopenharmony_ci hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 2162cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2); 2163cabdff1aSopenharmony_ci } else { 2164cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 2165cabdff1aSopenharmony_ci dst, dst_stride, filter_x, 
filter_y, height, 16); 2166cabdff1aSopenharmony_ci } 2167cabdff1aSopenharmony_ci} 2168cabdff1aSopenharmony_ci 2169cabdff1aSopenharmony_cistatic void hevc_hv_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride, 2170cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 2171cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2172cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 2173cabdff1aSopenharmony_ci int32_t height) 2174cabdff1aSopenharmony_ci{ 2175cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 2176cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, height, 24); 2177cabdff1aSopenharmony_ci} 2178cabdff1aSopenharmony_ci 2179cabdff1aSopenharmony_cistatic void hevc_hv_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride, 2180cabdff1aSopenharmony_ci int16_t *src1_ptr, int32_t src2_stride, 2181cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2182cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 2183cabdff1aSopenharmony_ci int32_t height) 2184cabdff1aSopenharmony_ci{ 2185cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride, 2186cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, height, 32); 2187cabdff1aSopenharmony_ci} 2188cabdff1aSopenharmony_ci 2189cabdff1aSopenharmony_ci#define BI_MC_COPY(WIDTH) \ 2190cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_lsx(uint8_t *dst, \ 2191cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 2192cabdff1aSopenharmony_ci uint8_t *src, \ 2193cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 2194cabdff1aSopenharmony_ci int16_t *src_16bit, \ 2195cabdff1aSopenharmony_ci int height, \ 2196cabdff1aSopenharmony_ci intptr_t mx, \ 2197cabdff1aSopenharmony_ci intptr_t my, \ 2198cabdff1aSopenharmony_ci int width) \ 2199cabdff1aSopenharmony_ci{ \ 2200cabdff1aSopenharmony_ci hevc_bi_copy_##WIDTH##w_lsx(src, src_stride, src_16bit, MAX_PB_SIZE, \ 
2201cabdff1aSopenharmony_ci dst, dst_stride, height); \ 2202cabdff1aSopenharmony_ci} 2203cabdff1aSopenharmony_ci 2204cabdff1aSopenharmony_ciBI_MC_COPY(4); 2205cabdff1aSopenharmony_ciBI_MC_COPY(6); 2206cabdff1aSopenharmony_ciBI_MC_COPY(8); 2207cabdff1aSopenharmony_ciBI_MC_COPY(12); 2208cabdff1aSopenharmony_ciBI_MC_COPY(16); 2209cabdff1aSopenharmony_ciBI_MC_COPY(24); 2210cabdff1aSopenharmony_ciBI_MC_COPY(32); 2211cabdff1aSopenharmony_ciBI_MC_COPY(48); 2212cabdff1aSopenharmony_ciBI_MC_COPY(64); 2213cabdff1aSopenharmony_ci 2214cabdff1aSopenharmony_ci#undef BI_MC_COPY 2215cabdff1aSopenharmony_ci 2216cabdff1aSopenharmony_ci#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 2217cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \ 2218cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 2219cabdff1aSopenharmony_ci uint8_t *src, \ 2220cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 2221cabdff1aSopenharmony_ci int16_t *src_16bit, \ 2222cabdff1aSopenharmony_ci int height, \ 2223cabdff1aSopenharmony_ci intptr_t mx, \ 2224cabdff1aSopenharmony_ci intptr_t my, \ 2225cabdff1aSopenharmony_ci int width) \ 2226cabdff1aSopenharmony_ci{ \ 2227cabdff1aSopenharmony_ci const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 2228cabdff1aSopenharmony_ci \ 2229cabdff1aSopenharmony_ci hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit, \ 2230cabdff1aSopenharmony_ci MAX_PB_SIZE, dst, dst_stride, \ 2231cabdff1aSopenharmony_ci filter, height); \ 2232cabdff1aSopenharmony_ci} 2233cabdff1aSopenharmony_ci 2234cabdff1aSopenharmony_ciBI_MC(qpel, h, 16, 8, hz, mx); 2235cabdff1aSopenharmony_ciBI_MC(qpel, h, 24, 8, hz, mx); 2236cabdff1aSopenharmony_ciBI_MC(qpel, h, 32, 8, hz, mx); 2237cabdff1aSopenharmony_ciBI_MC(qpel, h, 48, 8, hz, mx); 2238cabdff1aSopenharmony_ciBI_MC(qpel, h, 64, 8, hz, mx); 2239cabdff1aSopenharmony_ci 2240cabdff1aSopenharmony_ciBI_MC(qpel, v, 8, 8, vt, my); 2241cabdff1aSopenharmony_ciBI_MC(qpel, v, 16, 8, vt, my); 
2242cabdff1aSopenharmony_ciBI_MC(qpel, v, 24, 8, vt, my); 2243cabdff1aSopenharmony_ciBI_MC(qpel, v, 32, 8, vt, my); 2244cabdff1aSopenharmony_ciBI_MC(qpel, v, 48, 8, vt, my); 2245cabdff1aSopenharmony_ciBI_MC(qpel, v, 64, 8, vt, my); 2246cabdff1aSopenharmony_ci 2247cabdff1aSopenharmony_ciBI_MC(epel, h, 24, 4, hz, mx); 2248cabdff1aSopenharmony_ciBI_MC(epel, h, 32, 4, hz, mx); 2249cabdff1aSopenharmony_ci 2250cabdff1aSopenharmony_ciBI_MC(epel, v, 12, 4, vt, my); 2251cabdff1aSopenharmony_ciBI_MC(epel, v, 16, 4, vt, my); 2252cabdff1aSopenharmony_ciBI_MC(epel, v, 24, 4, vt, my); 2253cabdff1aSopenharmony_ciBI_MC(epel, v, 32, 4, vt, my); 2254cabdff1aSopenharmony_ci 2255cabdff1aSopenharmony_ci#undef BI_MC 2256cabdff1aSopenharmony_ci 2257cabdff1aSopenharmony_ci#define BI_MC_HV(PEL, WIDTH, TAP) \ 2258cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \ 2259cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 2260cabdff1aSopenharmony_ci uint8_t *src, \ 2261cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 2262cabdff1aSopenharmony_ci int16_t *src_16bit, \ 2263cabdff1aSopenharmony_ci int height, \ 2264cabdff1aSopenharmony_ci intptr_t mx, \ 2265cabdff1aSopenharmony_ci intptr_t my, \ 2266cabdff1aSopenharmony_ci int width) \ 2267cabdff1aSopenharmony_ci{ \ 2268cabdff1aSopenharmony_ci const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 2269cabdff1aSopenharmony_ci const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 2270cabdff1aSopenharmony_ci \ 2271cabdff1aSopenharmony_ci hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit, \ 2272cabdff1aSopenharmony_ci MAX_PB_SIZE, dst, dst_stride, \ 2273cabdff1aSopenharmony_ci filter_x, filter_y, height); \ 2274cabdff1aSopenharmony_ci} 2275cabdff1aSopenharmony_ci 2276cabdff1aSopenharmony_ciBI_MC_HV(qpel, 8, 8); 2277cabdff1aSopenharmony_ciBI_MC_HV(qpel, 16, 8); 2278cabdff1aSopenharmony_ciBI_MC_HV(qpel, 24, 8); 2279cabdff1aSopenharmony_ciBI_MC_HV(qpel, 32, 8); 2280cabdff1aSopenharmony_ciBI_MC_HV(qpel, 
48, 8); 2281cabdff1aSopenharmony_ciBI_MC_HV(qpel, 64, 8); 2282cabdff1aSopenharmony_ci 2283cabdff1aSopenharmony_ciBI_MC_HV(epel, 8, 4); 2284cabdff1aSopenharmony_ciBI_MC_HV(epel, 6, 4); 2285cabdff1aSopenharmony_ciBI_MC_HV(epel, 16, 4); 2286cabdff1aSopenharmony_ciBI_MC_HV(epel, 24, 4); 2287cabdff1aSopenharmony_ciBI_MC_HV(epel, 32, 4); 2288cabdff1aSopenharmony_ci 2289cabdff1aSopenharmony_ci#undef BI_MC_HV 2290