1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2022 Loongson Technology Corporation Limited 3cabdff1aSopenharmony_ci * Contributed by Lu Wang <wanglu@loongson.cn> 4cabdff1aSopenharmony_ci * Hao Chen <chenhao@loongson.cn> 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * This file is part of FFmpeg. 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 12cabdff1aSopenharmony_ci * 13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16cabdff1aSopenharmony_ci * Lesser General Public License for more details. 17cabdff1aSopenharmony_ci * 18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21cabdff1aSopenharmony_ci */ 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 24cabdff1aSopenharmony_ci#include "hevcdsp_lsx.h" 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_cistatic const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { 27cabdff1aSopenharmony_ci /* 8 width cases */ 28cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 29cabdff1aSopenharmony_ci /* 4 width cases */ 30cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 31cabdff1aSopenharmony_ci}; 32cabdff1aSopenharmony_ci 33cabdff1aSopenharmony_ci/* hevc_copy: dst = src << 6 */ 34cabdff1aSopenharmony_cistatic void hevc_copy_4w_lsx(uint8_t *src, int32_t src_stride, 35cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 36cabdff1aSopenharmony_ci int32_t height) 37cabdff1aSopenharmony_ci{ 38cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 39cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 40cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 41cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 42cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 43cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 44cabdff1aSopenharmony_ci int32_t loop_cnt = height >> 3; 45cabdff1aSopenharmony_ci int32_t res = height & 0x07; 46cabdff1aSopenharmony_ci 47cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 48cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3; 49cabdff1aSopenharmony_ci for (; loop_cnt--;) { 50cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 51cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, 52cabdff1aSopenharmony_ci src1, src2); 53cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 54cabdff1aSopenharmony_ci src += src_stride_4x; 55cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 56cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, 57cabdff1aSopenharmony_ci src5, src6); 58cabdff1aSopenharmony_ci src7 = __lsx_vldx(src, src_stride_3x); 59cabdff1aSopenharmony_ci src += src_stride_4x; 60cabdff1aSopenharmony_ci 61cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src5, src4, src7, src6, 62cabdff1aSopenharmony_ci src0, src1, src2, src3); 63cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 64cabdff1aSopenharmony_ci in0, in1, in2, in3); 65cabdff1aSopenharmony_ci 66cabdff1aSopenharmony_ci __lsx_vstelm_d(in0, dst, 0, 0); 67cabdff1aSopenharmony_ci __lsx_vstelm_d(in0, dst + dst_stride, 0, 1); 68cabdff1aSopenharmony_ci __lsx_vstelm_d(in1, dst + dst_stride_2x, 0, 0); 69cabdff1aSopenharmony_ci __lsx_vstelm_d(in1, dst + dst_stride_3x, 0, 1); 70cabdff1aSopenharmony_ci dst += dst_stride_4x; 71cabdff1aSopenharmony_ci __lsx_vstelm_d(in2, dst, 0, 0); 72cabdff1aSopenharmony_ci __lsx_vstelm_d(in2, dst + dst_stride, 0, 1); 73cabdff1aSopenharmony_ci __lsx_vstelm_d(in3, dst + dst_stride_2x, 0, 0); 74cabdff1aSopenharmony_ci __lsx_vstelm_d(in3, dst + dst_stride_3x, 0, 1); 75cabdff1aSopenharmony_ci dst += dst_stride_4x; 76cabdff1aSopenharmony_ci } 77cabdff1aSopenharmony_ci for (;res--;) { 78cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 79cabdff1aSopenharmony_ci in0 = __lsx_vsllwil_hu_bu(src0, 6); 80cabdff1aSopenharmony_ci __lsx_vstelm_d(in0, dst, 0, 0); 81cabdff1aSopenharmony_ci src += src_stride; 82cabdff1aSopenharmony_ci dst += dst_stride; 83cabdff1aSopenharmony_ci } 84cabdff1aSopenharmony_ci} 85cabdff1aSopenharmony_ci 86cabdff1aSopenharmony_cistatic void hevc_copy_6w_lsx(uint8_t *src, int32_t src_stride, 87cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 88cabdff1aSopenharmony_ci int32_t height) 89cabdff1aSopenharmony_ci{ 90cabdff1aSopenharmony_ci int32_t loop_cnt = (height >> 3); 91cabdff1aSopenharmony_ci int32_t res = height & 0x07; 92cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 93cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 94cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 95cabdff1aSopenharmony_ci 96cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 97cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5, in6, in7; 98cabdff1aSopenharmony_ci 99cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 100cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 101cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 102cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 103cabdff1aSopenharmony_ci src += src_stride_4x; 104cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 105cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 106cabdff1aSopenharmony_ci src7 = __lsx_vldx(src, src_stride_3x); 107cabdff1aSopenharmony_ci src += src_stride_4x; 108cabdff1aSopenharmony_ci 109cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 110cabdff1aSopenharmony_ci in0, in1, in2, in3); 111cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6, 112cabdff1aSopenharmony_ci in4, in5, in6, in7); 113cabdff1aSopenharmony_ci 114cabdff1aSopenharmony_ci __lsx_vstelm_d(in0, dst, 0, 0); 115cabdff1aSopenharmony_ci __lsx_vstelm_w(in0, dst, 8, 2); 116cabdff1aSopenharmony_ci dst += dst_stride; 117cabdff1aSopenharmony_ci __lsx_vstelm_d(in1, dst, 0, 0); 118cabdff1aSopenharmony_ci __lsx_vstelm_w(in1, dst, 8, 2); 119cabdff1aSopenharmony_ci dst += dst_stride; 120cabdff1aSopenharmony_ci __lsx_vstelm_d(in2, dst, 0, 0); 121cabdff1aSopenharmony_ci __lsx_vstelm_w(in2, dst, 8, 2); 122cabdff1aSopenharmony_ci dst += dst_stride; 123cabdff1aSopenharmony_ci __lsx_vstelm_d(in3, dst, 0, 0); 124cabdff1aSopenharmony_ci __lsx_vstelm_w(in3, dst, 8, 2); 125cabdff1aSopenharmony_ci dst += dst_stride; 126cabdff1aSopenharmony_ci __lsx_vstelm_d(in4, dst, 0, 0); 127cabdff1aSopenharmony_ci __lsx_vstelm_w(in4, dst, 8, 2); 128cabdff1aSopenharmony_ci dst += dst_stride; 129cabdff1aSopenharmony_ci __lsx_vstelm_d(in5, dst, 0, 0); 130cabdff1aSopenharmony_ci __lsx_vstelm_w(in5, dst, 8, 2); 131cabdff1aSopenharmony_ci dst += dst_stride; 132cabdff1aSopenharmony_ci __lsx_vstelm_d(in6, dst, 0, 0); 133cabdff1aSopenharmony_ci __lsx_vstelm_w(in6, dst, 8, 2); 134cabdff1aSopenharmony_ci dst += dst_stride; 135cabdff1aSopenharmony_ci __lsx_vstelm_d(in7, dst, 0, 0); 136cabdff1aSopenharmony_ci __lsx_vstelm_w(in7, dst, 8, 2); 137cabdff1aSopenharmony_ci dst += dst_stride; 138cabdff1aSopenharmony_ci } 139cabdff1aSopenharmony_ci for (;res--;) { 140cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 141cabdff1aSopenharmony_ci in0 = __lsx_vsllwil_hu_bu(src0, 6); 142cabdff1aSopenharmony_ci src += src_stride; 143cabdff1aSopenharmony_ci __lsx_vstelm_d(in0, dst, 0, 0); 144cabdff1aSopenharmony_ci __lsx_vstelm_w(in0, dst, 8, 2); 145cabdff1aSopenharmony_ci dst += dst_stride; 146cabdff1aSopenharmony_ci } 147cabdff1aSopenharmony_ci} 148cabdff1aSopenharmony_ci 149cabdff1aSopenharmony_cistatic void hevc_copy_8w_lsx(uint8_t *src, int32_t src_stride, 150cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 151cabdff1aSopenharmony_ci int32_t height) 152cabdff1aSopenharmony_ci{ 153cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 154cabdff1aSopenharmony_ci int32_t dst_stride_x = (dst_stride << 1); 155cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 156cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride_x << 1); 157cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 158cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride_x; 159cabdff1aSopenharmony_ci int32_t loop_cnt = height >> 3; 160cabdff1aSopenharmony_ci int32_t res = height & 0x07; 161cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 162cabdff1aSopenharmony_ci __m128i in0, in1, in2, in3, in4, in5, in6, in7; 163cabdff1aSopenharmony_ci 164cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 165cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 166cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, 167cabdff1aSopenharmony_ci src1, src2); 168cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 169cabdff1aSopenharmony_ci src += src_stride_4x; 170cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 171cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, 172cabdff1aSopenharmony_ci src5, src6); 173cabdff1aSopenharmony_ci src7 = __lsx_vldx(src, src_stride_3x); 174cabdff1aSopenharmony_ci src += src_stride_4x; 175cabdff1aSopenharmony_ci 176cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 177cabdff1aSopenharmony_ci in0, in1, in2, in3); 178cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6, 179cabdff1aSopenharmony_ci in4, in5, in6, in7); 180cabdff1aSopenharmony_ci __lsx_vst(in0, dst, 0); 181cabdff1aSopenharmony_ci __lsx_vstx(in1, dst, dst_stride_x); 182cabdff1aSopenharmony_ci __lsx_vstx(in2, dst, dst_stride_2x); 183cabdff1aSopenharmony_ci __lsx_vstx(in3, dst, dst_stride_3x); 184cabdff1aSopenharmony_ci dst += dst_stride_2x; 185cabdff1aSopenharmony_ci __lsx_vst(in4, dst, 0); 186cabdff1aSopenharmony_ci __lsx_vstx(in5, dst, dst_stride_x); 187cabdff1aSopenharmony_ci __lsx_vstx(in6, dst, dst_stride_2x); 188cabdff1aSopenharmony_ci __lsx_vstx(in7, dst, dst_stride_3x); 189cabdff1aSopenharmony_ci dst += dst_stride_2x; 190cabdff1aSopenharmony_ci } 191cabdff1aSopenharmony_ci for (;res--;) { 192cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 193cabdff1aSopenharmony_ci in0 = __lsx_vsllwil_hu_bu(src0, 6); 194cabdff1aSopenharmony_ci __lsx_vst(in0, dst, 0); 195cabdff1aSopenharmony_ci src += src_stride; 196cabdff1aSopenharmony_ci dst += dst_stride; 197cabdff1aSopenharmony_ci } 198cabdff1aSopenharmony_ci} 199cabdff1aSopenharmony_ci 200cabdff1aSopenharmony_cistatic void hevc_copy_12w_lsx(uint8_t *src, int32_t src_stride, 201cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 202cabdff1aSopenharmony_ci int32_t height) 203cabdff1aSopenharmony_ci{ 204cabdff1aSopenharmony_ci uint32_t loop_cnt; 205cabdff1aSopenharmony_ci uint32_t res = height & 0x07; 206cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 207cabdff1aSopenharmony_ci int32_t dst_stride_x = (dst_stride << 1); 208cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 209cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride_x << 1); 210cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 211cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride_x; 212cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 213cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 214cabdff1aSopenharmony_ci __m128i in0, in1, in0_r, in1_r, in2_r, in3_r; 215cabdff1aSopenharmony_ci 216cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 217cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 218cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 219cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 220cabdff1aSopenharmony_ci src += src_stride_4x; 221cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 222cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 223cabdff1aSopenharmony_ci src7 = __lsx_vldx(src, src_stride_3x); 224cabdff1aSopenharmony_ci src += src_stride_4x; 225cabdff1aSopenharmony_ci 226cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 227cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 228cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1); 229cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, in0, in1); 230cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 231cabdff1aSopenharmony_ci __lsx_vstx(in1_r, dst, dst_stride_x); 232cabdff1aSopenharmony_ci __lsx_vstx(in2_r, dst, dst_stride_2x); 233cabdff1aSopenharmony_ci __lsx_vstx(in3_r, dst, dst_stride_3x); 234cabdff1aSopenharmony_ci __lsx_vstelm_d(in0, dst, 16, 0); 235cabdff1aSopenharmony_ci dst += dst_stride; 236cabdff1aSopenharmony_ci __lsx_vstelm_d(in0, dst, 16, 1); 237cabdff1aSopenharmony_ci dst += dst_stride; 238cabdff1aSopenharmony_ci __lsx_vstelm_d(in1, dst, 16, 0); 239cabdff1aSopenharmony_ci dst += dst_stride; 240cabdff1aSopenharmony_ci __lsx_vstelm_d(in1, dst, 16, 1); 241cabdff1aSopenharmony_ci dst += dst_stride; 242cabdff1aSopenharmony_ci 243cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6, 244cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 245cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_w, src5, src4, src7, src6, src0, src1); 246cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, in0, in1); 247cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 248cabdff1aSopenharmony_ci __lsx_vstx(in1_r, dst, dst_stride_x); 249cabdff1aSopenharmony_ci __lsx_vstx(in2_r, dst, dst_stride_2x); 250cabdff1aSopenharmony_ci __lsx_vstx(in3_r, dst, dst_stride_3x); 251cabdff1aSopenharmony_ci __lsx_vstelm_d(in0, dst, 16, 0); 252cabdff1aSopenharmony_ci dst += dst_stride; 253cabdff1aSopenharmony_ci __lsx_vstelm_d(in0, dst, 16, 1); 254cabdff1aSopenharmony_ci dst += dst_stride; 255cabdff1aSopenharmony_ci __lsx_vstelm_d(in1, dst, 16, 0); 256cabdff1aSopenharmony_ci dst += dst_stride; 257cabdff1aSopenharmony_ci __lsx_vstelm_d(in1, dst, 16, 1); 258cabdff1aSopenharmony_ci dst += dst_stride; 259cabdff1aSopenharmony_ci } 260cabdff1aSopenharmony_ci for (;res--;) { 261cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 262cabdff1aSopenharmony_ci in0 = __lsx_vsllwil_hu_bu(src0, 6); 263cabdff1aSopenharmony_ci src1 = __lsx_vilvh_b(zero, src0); 264cabdff1aSopenharmony_ci in1 = __lsx_vslli_h(src1, 6); 265cabdff1aSopenharmony_ci __lsx_vst(in0, dst, 0); 266cabdff1aSopenharmony_ci __lsx_vstelm_d(in1, dst, 16, 0); 267cabdff1aSopenharmony_ci src += src_stride; 268cabdff1aSopenharmony_ci dst += dst_stride; 269cabdff1aSopenharmony_ci } 270cabdff1aSopenharmony_ci} 271cabdff1aSopenharmony_ci 272cabdff1aSopenharmony_cistatic void hevc_copy_16w_lsx(uint8_t *src, int32_t src_stride, 273cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 274cabdff1aSopenharmony_ci int32_t height) 275cabdff1aSopenharmony_ci{ 276cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 277cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 278cabdff1aSopenharmony_ci int32_t dst_stride_x = (dst_stride << 1); 279cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 280cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 2); 281cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 282cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride_x; 283cabdff1aSopenharmony_ci int32_t loop_cnt = height >> 3; 284cabdff1aSopenharmony_ci int32_t res = height & 0x07; 285cabdff1aSopenharmony_ci int16_t* dst1 = dst + 8; 286cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 287cabdff1aSopenharmony_ci __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 288cabdff1aSopenharmony_ci 289cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 290cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 291cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, 292cabdff1aSopenharmony_ci src1, src2); 293cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 294cabdff1aSopenharmony_ci src += src_stride_4x; 295cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 296cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, 297cabdff1aSopenharmony_ci src5, src6); 298cabdff1aSopenharmony_ci src7 = __lsx_vldx(src, src_stride_3x); 299cabdff1aSopenharmony_ci src += src_stride_4x; 300cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3, 301cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 302cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 303cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 304cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l, 305cabdff1aSopenharmony_ci in1_l, in2_l, in3_l); 306cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 307cabdff1aSopenharmony_ci __lsx_vstx(in1_r, dst, dst_stride_x); 308cabdff1aSopenharmony_ci __lsx_vstx(in2_r, dst, dst_stride_2x); 309cabdff1aSopenharmony_ci __lsx_vstx(in3_r, dst, dst_stride_3x); 310cabdff1aSopenharmony_ci __lsx_vst(in0_l, dst1, 0); 311cabdff1aSopenharmony_ci __lsx_vstx(in1_l, dst1, dst_stride_x); 312cabdff1aSopenharmony_ci __lsx_vstx(in2_l, dst1, dst_stride_2x); 313cabdff1aSopenharmony_ci __lsx_vstx(in3_l, dst1, dst_stride_3x); 314cabdff1aSopenharmony_ci dst += dst_stride_2x; 315cabdff1aSopenharmony_ci dst1 += dst_stride_2x; 316cabdff1aSopenharmony_ci 317cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, src7, 318cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 319cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6, 320cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 321cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l, 322cabdff1aSopenharmony_ci in1_l, in2_l, in3_l); 323cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 324cabdff1aSopenharmony_ci __lsx_vstx(in1_r, dst, dst_stride_x); 325cabdff1aSopenharmony_ci __lsx_vstx(in2_r, dst, dst_stride_2x); 326cabdff1aSopenharmony_ci __lsx_vstx(in3_r, dst, dst_stride_3x); 327cabdff1aSopenharmony_ci __lsx_vst(in0_l, dst1, 0); 328cabdff1aSopenharmony_ci __lsx_vstx(in1_l, dst1, dst_stride_x); 329cabdff1aSopenharmony_ci __lsx_vstx(in2_l, dst1, dst_stride_2x); 330cabdff1aSopenharmony_ci __lsx_vstx(in3_l, dst1, dst_stride_3x); 331cabdff1aSopenharmony_ci dst += dst_stride_2x; 332cabdff1aSopenharmony_ci dst1 += dst_stride_2x; 333cabdff1aSopenharmony_ci } 334cabdff1aSopenharmony_ci if (res) { 335cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 336cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 337cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 338cabdff1aSopenharmony_ci 339cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3, 340cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 341cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 342cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 343cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l, 344cabdff1aSopenharmony_ci in1_l, in2_l, in3_l); 345cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 346cabdff1aSopenharmony_ci __lsx_vstx(in1_r, dst, dst_stride_x); 347cabdff1aSopenharmony_ci __lsx_vstx(in2_r, dst, dst_stride_2x); 348cabdff1aSopenharmony_ci __lsx_vstx(in3_r, dst, dst_stride_3x); 349cabdff1aSopenharmony_ci dst += 8; 350cabdff1aSopenharmony_ci __lsx_vst(in0_l, dst, 0); 351cabdff1aSopenharmony_ci __lsx_vstx(in1_l, dst, dst_stride_x); 352cabdff1aSopenharmony_ci __lsx_vstx(in2_l, dst, dst_stride_2x); 353cabdff1aSopenharmony_ci __lsx_vstx(in3_l, dst, dst_stride_3x); 354cabdff1aSopenharmony_ci } 355cabdff1aSopenharmony_ci} 356cabdff1aSopenharmony_ci 357cabdff1aSopenharmony_cistatic void hevc_copy_24w_lsx(uint8_t *src, int32_t src_stride, 358cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 359cabdff1aSopenharmony_ci int32_t height) 360cabdff1aSopenharmony_ci{ 361cabdff1aSopenharmony_ci uint32_t loop_cnt; 362cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 363cabdff1aSopenharmony_ci int32_t dst_stride_x = (dst_stride << 1); 364cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 365cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 2); 366cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 367cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride_x; 368cabdff1aSopenharmony_ci uint8_t *_src = src + 16; 369cabdff1aSopenharmony_ci int16_t *dst1 = dst; 370cabdff1aSopenharmony_ci __m128i zero = __lsx_vldi(0); 371cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 372cabdff1aSopenharmony_ci __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 373cabdff1aSopenharmony_ci 374cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 375cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 376cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 377cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 378cabdff1aSopenharmony_ci src += src_stride_4x; 379cabdff1aSopenharmony_ci src4 = __lsx_vld(_src, 0); 380cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, 381cabdff1aSopenharmony_ci src5, src6); 382cabdff1aSopenharmony_ci src7 = __lsx_vldx(_src, src_stride_3x); 383cabdff1aSopenharmony_ci _src += src_stride_4x; 384cabdff1aSopenharmony_ci 385cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, 386cabdff1aSopenharmony_ci src3, in0_l, in1_l, in2_l, in3_l); 387cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 388cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 389cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, 390cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 391cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 392cabdff1aSopenharmony_ci __lsx_vstx(in1_r, dst, dst_stride_x); 393cabdff1aSopenharmony_ci __lsx_vstx(in2_r, dst, dst_stride_2x); 394cabdff1aSopenharmony_ci __lsx_vstx(in3_r, dst, dst_stride_3x); 395cabdff1aSopenharmony_ci dst1 = dst + 8; 396cabdff1aSopenharmony_ci __lsx_vst(in0_l, dst1, 0); 397cabdff1aSopenharmony_ci __lsx_vstx(in1_l, dst1, dst_stride_x); 398cabdff1aSopenharmony_ci __lsx_vstx(in2_l, dst1, dst_stride_2x); 399cabdff1aSopenharmony_ci __lsx_vstx(in3_l, dst1, dst_stride_3x); 400cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6, 401cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 402cabdff1aSopenharmony_ci dst1 = dst1 + 8; 403cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst1, 0); 404cabdff1aSopenharmony_ci __lsx_vstx(in1_r, dst1, dst_stride_x); 405cabdff1aSopenharmony_ci __lsx_vstx(in2_r, dst1, dst_stride_2x); 406cabdff1aSopenharmony_ci __lsx_vstx(in3_r, dst1, dst_stride_3x); 407cabdff1aSopenharmony_ci dst += dst_stride_2x; 408cabdff1aSopenharmony_ci } 409cabdff1aSopenharmony_ci} 410cabdff1aSopenharmony_ci 411cabdff1aSopenharmony_cistatic void hevc_copy_32w_lsx(uint8_t *src, int32_t src_stride, 412cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 413cabdff1aSopenharmony_ci int32_t height) 414cabdff1aSopenharmony_ci{ 415cabdff1aSopenharmony_ci uint32_t loop_cnt; 416cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 417cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 418cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 419cabdff1aSopenharmony_ci uint8_t *_src = src + 16; 420cabdff1aSopenharmony_ci __m128i zero = {0}; 421cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 422cabdff1aSopenharmony_ci __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 423cabdff1aSopenharmony_ci 424cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 425cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 426cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src2, src4); 427cabdff1aSopenharmony_ci src6 = __lsx_vldx(src, src_stride_3x); 428cabdff1aSopenharmony_ci src += src_stride_4x; 429cabdff1aSopenharmony_ci src1 = __lsx_vld(_src, 0); 430cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, 431cabdff1aSopenharmony_ci src3, src5); 432cabdff1aSopenharmony_ci src7 = __lsx_vldx(_src, src_stride_3x); 433cabdff1aSopenharmony_ci _src += src_stride_4x; 434cabdff1aSopenharmony_ci 435cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, 436cabdff1aSopenharmony_ci src3, in0_l, in1_l, in2_l, in3_l); 437cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 438cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 439cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, 440cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 441cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 442cabdff1aSopenharmony_ci __lsx_vst(in0_l, dst, 16); 443cabdff1aSopenharmony_ci __lsx_vst(in1_r, dst, 32); 444cabdff1aSopenharmony_ci __lsx_vst(in1_l, dst, 48); 445cabdff1aSopenharmony_ci dst += dst_stride; 446cabdff1aSopenharmony_ci __lsx_vst(in2_r, dst, 0); 447cabdff1aSopenharmony_ci __lsx_vst(in2_l, dst, 16); 448cabdff1aSopenharmony_ci __lsx_vst(in3_r, dst, 32); 449cabdff1aSopenharmony_ci __lsx_vst(in3_l, dst, 48); 450cabdff1aSopenharmony_ci dst += dst_stride; 451cabdff1aSopenharmony_ci 452cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, src7, 453cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 454cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6, 455cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 456cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l, 457cabdff1aSopenharmony_ci in1_l, in2_l, in3_l); 458cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 459cabdff1aSopenharmony_ci __lsx_vst(in0_l, dst, 16); 460cabdff1aSopenharmony_ci __lsx_vst(in1_r, dst, 32); 461cabdff1aSopenharmony_ci __lsx_vst(in1_l, dst, 48); 462cabdff1aSopenharmony_ci dst += dst_stride; 463cabdff1aSopenharmony_ci __lsx_vst(in2_r, dst, 0); 464cabdff1aSopenharmony_ci __lsx_vst(in2_l, dst, 16); 465cabdff1aSopenharmony_ci __lsx_vst(in3_r, dst, 32); 466cabdff1aSopenharmony_ci __lsx_vst(in3_l, dst, 48); 467cabdff1aSopenharmony_ci dst += dst_stride; 468cabdff1aSopenharmony_ci } 469cabdff1aSopenharmony_ci} 470cabdff1aSopenharmony_ci 471cabdff1aSopenharmony_cistatic void hevc_copy_48w_lsx(uint8_t *src, int32_t src_stride, 472cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 473cabdff1aSopenharmony_ci int32_t height) 474cabdff1aSopenharmony_ci{ 475cabdff1aSopenharmony_ci uint32_t loop_cnt; 476cabdff1aSopenharmony_ci __m128i zero = {0}; 477cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 478cabdff1aSopenharmony_ci __m128i src8, src9, src10, src11; 479cabdff1aSopenharmony_ci __m128i in0_r, in1_r, in2_r, in3_r, in4_r, in5_r; 480cabdff1aSopenharmony_ci __m128i in0_l, in1_l, in2_l, in3_l, in4_l, in5_l; 481cabdff1aSopenharmony_ci 482cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 483cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); 484cabdff1aSopenharmony_ci src2 = __lsx_vld(src, 32); 485cabdff1aSopenharmony_ci src += src_stride; 486cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 16, src3, src4); 487cabdff1aSopenharmony_ci src5 = __lsx_vld(src, 32); 488cabdff1aSopenharmony_ci src += src_stride; 489cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 16, src6, src7); 490cabdff1aSopenharmony_ci src8 = __lsx_vld(src, 32); 491cabdff1aSopenharmony_ci src += src_stride; 492cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 16, src9, src10); 493cabdff1aSopenharmony_ci src11 = __lsx_vld(src, 32); 494cabdff1aSopenharmony_ci src += src_stride; 495cabdff1aSopenharmony_ci 496cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, 497cabdff1aSopenharmony_ci src3, in0_l, in1_l, in2_l, in3_l); 498cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, in4_l, in5_l); 499cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 500cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 501cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l, 502cabdff1aSopenharmony_ci in1_l, in2_l, in3_l); 503cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, in4_r, in5_r); 504cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l); 505cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 506cabdff1aSopenharmony_ci __lsx_vst(in0_l, dst, 16); 507cabdff1aSopenharmony_ci __lsx_vst(in1_r, dst, 32); 508cabdff1aSopenharmony_ci __lsx_vst(in1_l, dst, 48); 509cabdff1aSopenharmony_ci __lsx_vst(in2_r, dst, 64); 510cabdff1aSopenharmony_ci __lsx_vst(in2_l, dst, 80); 511cabdff1aSopenharmony_ci dst += dst_stride; 512cabdff1aSopenharmony_ci __lsx_vst(in3_r, dst, 0); 513cabdff1aSopenharmony_ci __lsx_vst(in3_l, dst, 16); 514cabdff1aSopenharmony_ci __lsx_vst(in4_r, dst, 32); 515cabdff1aSopenharmony_ci __lsx_vst(in4_l, dst, 48); 516cabdff1aSopenharmony_ci __lsx_vst(in5_r, dst, 64); 517cabdff1aSopenharmony_ci __lsx_vst(in5_l, dst, 80); 518cabdff1aSopenharmony_ci dst += dst_stride; 519cabdff1aSopenharmony_ci 520cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src6, zero, src7, zero, src8, zero, src9, 521cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 522cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, zero, src10, zero, src11, in4_l, in5_l); 523cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src6, 6, src7, 6, src8, 6, src9, 6, 524cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 525cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l, 526cabdff1aSopenharmony_ci in1_l, in2_l, in3_l); 527cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vsllwil_hu_bu, src10, 6, src11, 6, in4_r, in5_r); 528cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l); 529cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 530cabdff1aSopenharmony_ci __lsx_vst(in0_l, dst, 16); 531cabdff1aSopenharmony_ci __lsx_vst(in1_r, dst, 32); 532cabdff1aSopenharmony_ci __lsx_vst(in1_l, dst, 48); 533cabdff1aSopenharmony_ci __lsx_vst(in2_r, dst, 64); 534cabdff1aSopenharmony_ci __lsx_vst(in2_l, dst, 80); 535cabdff1aSopenharmony_ci dst += dst_stride; 536cabdff1aSopenharmony_ci __lsx_vst(in3_r, dst, 0); 537cabdff1aSopenharmony_ci __lsx_vst(in3_l, dst, 16); 538cabdff1aSopenharmony_ci __lsx_vst(in4_r, dst, 32); 539cabdff1aSopenharmony_ci __lsx_vst(in4_l, dst, 48); 540cabdff1aSopenharmony_ci __lsx_vst(in5_r, dst, 64); 541cabdff1aSopenharmony_ci __lsx_vst(in5_l, dst, 80); 542cabdff1aSopenharmony_ci dst += dst_stride; 543cabdff1aSopenharmony_ci } 544cabdff1aSopenharmony_ci} 545cabdff1aSopenharmony_ci 546cabdff1aSopenharmony_cistatic void hevc_copy_64w_lsx(uint8_t *src, int32_t src_stride, 547cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 548cabdff1aSopenharmony_ci int32_t height) 549cabdff1aSopenharmony_ci{ 550cabdff1aSopenharmony_ci uint32_t loop_cnt; 551cabdff1aSopenharmony_ci __m128i zero = {0}; 552cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 553cabdff1aSopenharmony_ci __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 554cabdff1aSopenharmony_ci 555cabdff1aSopenharmony_ci 556cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 557cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, 558cabdff1aSopenharmony_ci src0, src1, src2, src3); 559cabdff1aSopenharmony_ci src += src_stride; 560cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, 561cabdff1aSopenharmony_ci src4, src5, src6, src7); 562cabdff1aSopenharmony_ci src += src_stride; 563cabdff1aSopenharmony_ci 564cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, 565cabdff1aSopenharmony_ci src3, in0_l, in1_l, in2_l, in3_l); 566cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6, 567cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 568cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, 569cabdff1aSopenharmony_ci in0_l, in1_l, in2_l, in3_l); 570cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 571cabdff1aSopenharmony_ci __lsx_vst(in0_l, dst, 16); 572cabdff1aSopenharmony_ci __lsx_vst(in1_r, dst, 32); 573cabdff1aSopenharmony_ci __lsx_vst(in1_l, dst, 48); 574cabdff1aSopenharmony_ci __lsx_vst(in2_r, dst, 64); 575cabdff1aSopenharmony_ci __lsx_vst(in2_l, dst, 80); 576cabdff1aSopenharmony_ci __lsx_vst(in3_r, dst, 96); 577cabdff1aSopenharmony_ci __lsx_vst(in3_l, dst, 112); 578cabdff1aSopenharmony_ci dst += dst_stride; 579cabdff1aSopenharmony_ci 580cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, 581cabdff1aSopenharmony_ci src7, in0_l, in1_l, in2_l, in3_l); 582cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6, 583cabdff1aSopenharmony_ci in0_r, in1_r, in2_r, in3_r); 584cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l, 585cabdff1aSopenharmony_ci in1_l, in2_l, in3_l); 586cabdff1aSopenharmony_ci __lsx_vst(in0_r, dst, 0); 587cabdff1aSopenharmony_ci __lsx_vst(in0_l, dst, 16); 588cabdff1aSopenharmony_ci __lsx_vst(in1_r, dst, 32); 589cabdff1aSopenharmony_ci __lsx_vst(in1_l, dst, 48); 590cabdff1aSopenharmony_ci __lsx_vst(in2_r, dst, 64); 591cabdff1aSopenharmony_ci __lsx_vst(in2_l, dst, 80); 592cabdff1aSopenharmony_ci __lsx_vst(in3_r, dst, 96); 593cabdff1aSopenharmony_ci __lsx_vst(in3_l, dst, 112); 594cabdff1aSopenharmony_ci dst += dst_stride; 595cabdff1aSopenharmony_ci } 596cabdff1aSopenharmony_ci} 597cabdff1aSopenharmony_ci 598cabdff1aSopenharmony_cistatic void hevc_hz_8t_4w_lsx(uint8_t *src, int32_t src_stride, 599cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 600cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 601cabdff1aSopenharmony_ci{ 602cabdff1aSopenharmony_ci uint32_t loop_cnt = height >> 3; 603cabdff1aSopenharmony_ci uint32_t res = (height & 0x7) >> 1; 604cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 605cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 606cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 607cabdff1aSopenharmony_ci int32_t dst_stride_4x = (dst_stride << 2); 608cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 609cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 610cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 611cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 612cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 613cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3; 614cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3; 615cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 16); 616cabdff1aSopenharmony_ci 617cabdff1aSopenharmony_ci src -= 3; 618cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 619cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 620cabdff1aSopenharmony_ci 621cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 622cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 623cabdff1aSopenharmony_ci 624cabdff1aSopenharmony_ci for (;loop_cnt--;) { 625cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 626cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 627cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 628cabdff1aSopenharmony_ci src += src_stride_4x; 629cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 630cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 631cabdff1aSopenharmony_ci src7 = __lsx_vldx(src, src_stride_3x); 632cabdff1aSopenharmony_ci src += src_stride_4x; 633cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, src1, 634cabdff1aSopenharmony_ci src0, mask2, src1, src0, mask3, vec0, vec1, vec2, vec3); 635cabdff1aSopenharmony_ci dst0 = __lsx_vdp2_h_bu_b(vec0, filt0); 636cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2, 637cabdff1aSopenharmony_ci dst0, dst0); 638cabdff1aSopenharmony_ci dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3); 639cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src2, mask0, src3, src2, mask1, src3, 640cabdff1aSopenharmony_ci src2, mask2, src3, src2, mask3, vec0, vec1, vec2, vec3); 641cabdff1aSopenharmony_ci dst1 = __lsx_vdp2_h_bu_b(vec0, filt0); 642cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2, 643cabdff1aSopenharmony_ci dst1, dst1); 644cabdff1aSopenharmony_ci dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3); 645cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src5, src4, mask0, src5, src4, mask1, src5, 646cabdff1aSopenharmony_ci src4, mask2, src5, src4, mask3, vec0, vec1, vec2, vec3); 647cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec0, filt0); 648cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2, 649cabdff1aSopenharmony_ci dst2, dst2); 650cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3); 651cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src7, src6, mask0, src7, src6, mask1, src7, 652cabdff1aSopenharmony_ci src6, mask2, src7, src6, mask3, vec0, vec1, vec2, vec3); 653cabdff1aSopenharmony_ci dst3 = __lsx_vdp2_h_bu_b(vec0, filt0); 654cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2, 655cabdff1aSopenharmony_ci dst3, dst3); 656cabdff1aSopenharmony_ci dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3); 657cabdff1aSopenharmony_ci 658cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); 659cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); 660cabdff1aSopenharmony_ci __lsx_vstelm_d(dst1, dst + dst_stride_2x, 0, 0); 661cabdff1aSopenharmony_ci __lsx_vstelm_d(dst1, dst + dst_stride_3x, 0, 1); 662cabdff1aSopenharmony_ci dst += dst_stride_4x; 663cabdff1aSopenharmony_ci __lsx_vstelm_d(dst2, dst, 0, 0); 664cabdff1aSopenharmony_ci __lsx_vstelm_d(dst2, dst + dst_stride, 0, 1); 665cabdff1aSopenharmony_ci __lsx_vstelm_d(dst3, dst + dst_stride_2x, 0, 0); 666cabdff1aSopenharmony_ci __lsx_vstelm_d(dst3, dst + dst_stride_3x, 0, 1); 667cabdff1aSopenharmony_ci dst += dst_stride_4x; 668cabdff1aSopenharmony_ci } 669cabdff1aSopenharmony_ci for (;res--;) { 670cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 671cabdff1aSopenharmony_ci src1 = __lsx_vldx(src, src_stride); 672cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, src1, 673cabdff1aSopenharmony_ci src0, mask2, src1, src0, mask3, vec0, vec1, vec2, vec3); 674cabdff1aSopenharmony_ci dst0 = __lsx_vdp2_h_bu_b(vec0, filt0); 675cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2, 676cabdff1aSopenharmony_ci dst0, dst0); 677cabdff1aSopenharmony_ci dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3); 678cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); 679cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); 680cabdff1aSopenharmony_ci src += src_stride_2x; 681cabdff1aSopenharmony_ci dst += dst_stride_2x; 682cabdff1aSopenharmony_ci } 683cabdff1aSopenharmony_ci} 684cabdff1aSopenharmony_ci 685cabdff1aSopenharmony_cistatic void hevc_hz_8t_8w_lsx(uint8_t *src, int32_t src_stride, 686cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 687cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 688cabdff1aSopenharmony_ci{ 689cabdff1aSopenharmony_ci uint32_t loop_cnt; 690cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 691cabdff1aSopenharmony_ci int32_t dst_stride_x = (dst_stride << 1); 692cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 693cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 2); 694cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 695cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride_x; 696cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3; 697cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 698cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 699cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3; 700cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3; 701cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 702cabdff1aSopenharmony_ci 703cabdff1aSopenharmony_ci src -= 3; 704cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 705cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 706cabdff1aSopenharmony_ci 707cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 708cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 709cabdff1aSopenharmony_ci 710cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 711cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 712cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 713cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 714cabdff1aSopenharmony_ci src += src_stride_4x; 715cabdff1aSopenharmony_ci 716cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, 717cabdff1aSopenharmony_ci src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3); 718cabdff1aSopenharmony_ci dst0 = __lsx_vdp2_h_bu_b(vec0, filt0); 719cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2, 720cabdff1aSopenharmony_ci dst0, dst0); 721cabdff1aSopenharmony_ci dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3); 722cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, 723cabdff1aSopenharmony_ci src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3); 724cabdff1aSopenharmony_ci dst1 = __lsx_vdp2_h_bu_b(vec0, filt0); 725cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2, 726cabdff1aSopenharmony_ci dst1, dst1); 727cabdff1aSopenharmony_ci dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3); 728cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, 729cabdff1aSopenharmony_ci src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3); 730cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec0, filt0); 731cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2, 732cabdff1aSopenharmony_ci dst2, dst2); 733cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3); 734cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, 735cabdff1aSopenharmony_ci src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3); 736cabdff1aSopenharmony_ci dst3 = __lsx_vdp2_h_bu_b(vec0, filt0); 737cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2, 738cabdff1aSopenharmony_ci dst3, dst3); 739cabdff1aSopenharmony_ci dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3); 740cabdff1aSopenharmony_ci 741cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 742cabdff1aSopenharmony_ci __lsx_vstx(dst1, dst, dst_stride_x); 743cabdff1aSopenharmony_ci __lsx_vstx(dst2, dst, dst_stride_2x); 744cabdff1aSopenharmony_ci __lsx_vstx(dst3, dst, dst_stride_3x); 745cabdff1aSopenharmony_ci dst += dst_stride_2x; 746cabdff1aSopenharmony_ci } 747cabdff1aSopenharmony_ci} 748cabdff1aSopenharmony_ci 749cabdff1aSopenharmony_cistatic void hevc_hz_8t_12w_lsx(uint8_t *src, int32_t src_stride, 750cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 751cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 752cabdff1aSopenharmony_ci{ 753cabdff1aSopenharmony_ci uint32_t loop_cnt; 754cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 755cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 756cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 757cabdff1aSopenharmony_ci uint8_t *_src; 758cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 759cabdff1aSopenharmony_ci __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 760cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5; 761cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5; 762cabdff1aSopenharmony_ci 763cabdff1aSopenharmony_ci src -= 3; 764cabdff1aSopenharmony_ci _src = src + 8; 765cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 766cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 767cabdff1aSopenharmony_ci 768cabdff1aSopenharmony_ci mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 769cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 770cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 771cabdff1aSopenharmony_ci mask4 = __lsx_vld(ff_hevc_mask_arr, 16); 772cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6); 773cabdff1aSopenharmony_ci mask7 = __lsx_vaddi_bu(mask4, 6); 774cabdff1aSopenharmony_ci 775cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 776cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 777cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 778cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 779cabdff1aSopenharmony_ci src4 = __lsx_vld(_src, 0); 780cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, 781cabdff1aSopenharmony_ci src5, src6); 782cabdff1aSopenharmony_ci src7 = __lsx_vldx(_src, src_stride_3x); 783cabdff1aSopenharmony_ci src += src_stride_4x; 784cabdff1aSopenharmony_ci _src += src_stride_4x; 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, 787cabdff1aSopenharmony_ci vec0, vec1); 788cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, 789cabdff1aSopenharmony_ci vec2, vec3); 790cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask4, src7, src6, mask4, 791cabdff1aSopenharmony_ci vec4, vec5); 792cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 793cabdff1aSopenharmony_ci vec3, filt0, dst0, dst1, dst2, dst3); 794cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5); 795cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, 796cabdff1aSopenharmony_ci vec0, vec1); 797cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, 798cabdff1aSopenharmony_ci vec2, vec3); 799cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask5, src7, src6, mask5, 800cabdff1aSopenharmony_ci vec4, vec5); 801cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1, 802cabdff1aSopenharmony_ci dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3); 803cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1, 804cabdff1aSopenharmony_ci dst4, dst5); 805cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, 806cabdff1aSopenharmony_ci vec0, vec1); 807cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, 808cabdff1aSopenharmony_ci vec2, vec3); 809cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask6, src7, src6, mask6, 810cabdff1aSopenharmony_ci vec4, vec5); 811cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2, 812cabdff1aSopenharmony_ci dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3); 813cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2, 814cabdff1aSopenharmony_ci dst4, dst5); 815cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, 816cabdff1aSopenharmony_ci vec0, vec1); 817cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3, 818cabdff1aSopenharmony_ci vec2, vec3); 819cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask7, src7, src6, mask7, 820cabdff1aSopenharmony_ci vec4, vec5); 821cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3, 822cabdff1aSopenharmony_ci dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3); 823cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3, 824cabdff1aSopenharmony_ci dst4, dst5); 825cabdff1aSopenharmony_ci 826cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 827cabdff1aSopenharmony_ci __lsx_vstelm_d(dst4, dst, 16, 0); 828cabdff1aSopenharmony_ci dst += dst_stride; 829cabdff1aSopenharmony_ci __lsx_vst(dst1, dst, 0); 830cabdff1aSopenharmony_ci __lsx_vstelm_d(dst4, dst, 16, 1); 831cabdff1aSopenharmony_ci dst += dst_stride; 832cabdff1aSopenharmony_ci __lsx_vst(dst2, dst, 0); 833cabdff1aSopenharmony_ci __lsx_vstelm_d(dst5, dst, 16, 0); 834cabdff1aSopenharmony_ci dst += dst_stride; 835cabdff1aSopenharmony_ci __lsx_vst(dst3, dst, 0); 836cabdff1aSopenharmony_ci __lsx_vstelm_d(dst5, dst, 16, 1); 837cabdff1aSopenharmony_ci dst += dst_stride; 838cabdff1aSopenharmony_ci } 839cabdff1aSopenharmony_ci} 840cabdff1aSopenharmony_ci 841cabdff1aSopenharmony_cistatic void hevc_hz_8t_16w_lsx(uint8_t *src, int32_t src_stride, 842cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 843cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 844cabdff1aSopenharmony_ci{ 845cabdff1aSopenharmony_ci uint32_t loop_cnt; 846cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3; 847cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 848cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 849cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3; 850cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3; 851cabdff1aSopenharmony_ci __m128i mask0; 852cabdff1aSopenharmony_ci 853cabdff1aSopenharmony_ci src -= 3; 854cabdff1aSopenharmony_ci mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 855cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 856cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 857cabdff1aSopenharmony_ci 858cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 859cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 860cabdff1aSopenharmony_ci 861cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 862cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); 863cabdff1aSopenharmony_ci src += src_stride; 864cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); 865cabdff1aSopenharmony_ci src += src_stride; 866cabdff1aSopenharmony_ci 867cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, 868cabdff1aSopenharmony_ci vec0, vec1); 869cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, 870cabdff1aSopenharmony_ci vec2, vec3); 871cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 872cabdff1aSopenharmony_ci vec3, filt0, dst0, dst1, dst2, dst3); 873cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, 874cabdff1aSopenharmony_ci vec0, vec1); 875cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, 876cabdff1aSopenharmony_ci vec2, vec3); 877cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1, 878cabdff1aSopenharmony_ci dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3); 879cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, 880cabdff1aSopenharmony_ci vec0, vec1); 881cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, 882cabdff1aSopenharmony_ci vec2, vec3); 883cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2, 884cabdff1aSopenharmony_ci dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3); 885cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, 886cabdff1aSopenharmony_ci vec0, vec1); 887cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3, 888cabdff1aSopenharmony_ci vec2, vec3); 889cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3, 890cabdff1aSopenharmony_ci dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3); 891cabdff1aSopenharmony_ci 892cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 893cabdff1aSopenharmony_ci __lsx_vst(dst1, dst, 16); 894cabdff1aSopenharmony_ci dst += dst_stride; 895cabdff1aSopenharmony_ci __lsx_vst(dst2, dst, 0); 896cabdff1aSopenharmony_ci __lsx_vst(dst3, dst, 16); 897cabdff1aSopenharmony_ci dst += dst_stride; 898cabdff1aSopenharmony_ci } 899cabdff1aSopenharmony_ci} 900cabdff1aSopenharmony_ci 901cabdff1aSopenharmony_cistatic void hevc_hz_8t_24w_lsx(uint8_t *src, int32_t src_stride, 902cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 903cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 904cabdff1aSopenharmony_ci{ 905cabdff1aSopenharmony_ci uint32_t loop_cnt; 906cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3; 907cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 908cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7; 909cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5; 910cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5; 911cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 912cabdff1aSopenharmony_ci 913cabdff1aSopenharmony_ci src -= 3; 914cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 915cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 916cabdff1aSopenharmony_ci 917cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1, 918cabdff1aSopenharmony_ci mask2, mask3, mask4); 919cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6); 920cabdff1aSopenharmony_ci mask7 = __lsx_vaddi_bu(mask0, 14); 921cabdff1aSopenharmony_ci 922cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 923cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); 924cabdff1aSopenharmony_ci src += src_stride; 925cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 16, src2, src3); 926cabdff1aSopenharmony_ci src += src_stride; 927cabdff1aSopenharmony_ci 928cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1, 929cabdff1aSopenharmony_ci src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3); 930cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask4, src3, src3, mask0, 931cabdff1aSopenharmony_ci vec4, vec5); 932cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 933cabdff1aSopenharmony_ci vec3, filt0, dst0, dst1, dst2, dst3); 934cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5); 935cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask5, src1, 936cabdff1aSopenharmony_ci src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3); 937cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask5, src3, src3, mask1, 938cabdff1aSopenharmony_ci vec4, vec5); 939cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1, 940cabdff1aSopenharmony_ci dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3); 941cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1, 942cabdff1aSopenharmony_ci dst4, dst5); 943cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src0, mask6, src1, 944cabdff1aSopenharmony_ci src1, mask2, src2, src2, mask2, vec0, vec1, vec2, vec3); 945cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask6, src3, src3, mask2, 946cabdff1aSopenharmony_ci vec4, vec5); 947cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2, 948cabdff1aSopenharmony_ci dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3); 949cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2, 950cabdff1aSopenharmony_ci dst4, dst5); 951cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src0, mask7, src1, 952cabdff1aSopenharmony_ci src1, mask3, src2, src2, mask3, vec0, vec1, vec2, vec3); 953cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask7, src3, src3, mask3, 954cabdff1aSopenharmony_ci vec4, vec5); 955cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3, 956cabdff1aSopenharmony_ci dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3); 957cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3, 958cabdff1aSopenharmony_ci dst4, dst5); 959cabdff1aSopenharmony_ci 960cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 961cabdff1aSopenharmony_ci __lsx_vst(dst1, dst, 16); 962cabdff1aSopenharmony_ci __lsx_vst(dst2, dst, 32); 963cabdff1aSopenharmony_ci dst += dst_stride; 964cabdff1aSopenharmony_ci __lsx_vst(dst3, dst, 0); 965cabdff1aSopenharmony_ci __lsx_vst(dst4, dst, 16); 966cabdff1aSopenharmony_ci __lsx_vst(dst5, dst, 32); 967cabdff1aSopenharmony_ci dst += dst_stride; 968cabdff1aSopenharmony_ci } 969cabdff1aSopenharmony_ci} 970cabdff1aSopenharmony_ci 971cabdff1aSopenharmony_cistatic void hevc_hz_8t_32w_lsx(uint8_t *src, int32_t src_stride, 972cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 973cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 974cabdff1aSopenharmony_ci{ 975cabdff1aSopenharmony_ci uint32_t loop_cnt; 976cabdff1aSopenharmony_ci __m128i src0, src1, src2; 977cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 978cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7; 979cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3; 980cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3; 981cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 982cabdff1aSopenharmony_ci 983cabdff1aSopenharmony_ci src -= 3; 984cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 985cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 986cabdff1aSopenharmony_ci 987cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, 988cabdff1aSopenharmony_ci mask1, mask2, mask3, mask4); 989cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6); 990cabdff1aSopenharmony_ci mask7 = __lsx_vaddi_bu(mask0, 14); 991cabdff1aSopenharmony_ci 992cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 993cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); 994cabdff1aSopenharmony_ci src2 = __lsx_vld(src, 24); 995cabdff1aSopenharmony_ci src += src_stride; 996cabdff1aSopenharmony_ci 997cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, 998cabdff1aSopenharmony_ci src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3); 999cabdff1aSopenharmony_ci dst0 = __lsx_vdp2_h_bu_b(vec0, filt0); 1000cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2, 1001cabdff1aSopenharmony_ci dst0, dst0); 1002cabdff1aSopenharmony_ci dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3); 1003cabdff1aSopenharmony_ci 1004cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask4, src1, src0, mask5, src1, 1005cabdff1aSopenharmony_ci src0, mask6, src1, src0, mask7, vec0, vec1, vec2, vec3); 1006cabdff1aSopenharmony_ci dst1 = __lsx_vdp2_h_bu_b(vec0, filt0); 1007cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2, 1008cabdff1aSopenharmony_ci dst1, dst1); 1009cabdff1aSopenharmony_ci dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3); 1010cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, src1, 1011cabdff1aSopenharmony_ci mask2, src1, src1, mask3, vec0, vec1, vec2, vec3); 1012cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec0, filt0); 1013cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2, 1014cabdff1aSopenharmony_ci dst2, dst2); 1015cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3); 1016cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, src2, 1017cabdff1aSopenharmony_ci mask2, src2, src2, mask3, vec0, vec1, vec2, vec3); 1018cabdff1aSopenharmony_ci dst3 = __lsx_vdp2_h_bu_b(vec0, filt0); 1019cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2, 1020cabdff1aSopenharmony_ci dst3, dst3); 1021cabdff1aSopenharmony_ci dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3); 1022cabdff1aSopenharmony_ci 1023cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 1024cabdff1aSopenharmony_ci __lsx_vst(dst1, dst, 16); 1025cabdff1aSopenharmony_ci __lsx_vst(dst2, dst, 32); 1026cabdff1aSopenharmony_ci __lsx_vst(dst3, dst, 48); 1027cabdff1aSopenharmony_ci dst += dst_stride; 1028cabdff1aSopenharmony_ci } 1029cabdff1aSopenharmony_ci} 1030cabdff1aSopenharmony_ci 1031cabdff1aSopenharmony_cistatic void hevc_hz_8t_48w_lsx(uint8_t *src, int32_t src_stride, 1032cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1033cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1034cabdff1aSopenharmony_ci{ 1035cabdff1aSopenharmony_ci uint32_t loop_cnt; 1036cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3; 1037cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 1038cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1039cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5; 1040cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5; 1041cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 1042cabdff1aSopenharmony_ci 1043cabdff1aSopenharmony_ci src -= 3; 1044cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 1045cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1046cabdff1aSopenharmony_ci 1047cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1, 1048cabdff1aSopenharmony_ci mask2, mask3, mask4); 1049cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6); 1050cabdff1aSopenharmony_ci mask7 = __lsx_vaddi_bu(mask0, 14); 1051cabdff1aSopenharmony_ci 1052cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1053cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); 1054cabdff1aSopenharmony_ci src2 = __lsx_vld(src, 32); 1055cabdff1aSopenharmony_ci src3 = __lsx_vld(src, 40); 1056cabdff1aSopenharmony_ci src += src_stride; 1057cabdff1aSopenharmony_ci 1058cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1, src1, 1059cabdff1aSopenharmony_ci mask0, src2, src1, mask4, vec0, vec1, vec2, vec3); 1060cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 1061cabdff1aSopenharmony_ci vec3, filt0, dst0, dst1, dst2, dst3); 1062cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask5, src1, 1063cabdff1aSopenharmony_ci src1, mask1, src2, src1, mask5, vec0, vec1, vec2, vec3); 1064cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1, 1065cabdff1aSopenharmony_ci dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3); 1066cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src0, mask6, src1, 1067cabdff1aSopenharmony_ci src1, mask2, src2, src1, mask6, vec0, vec1, vec2, vec3); 1068cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2, 1069cabdff1aSopenharmony_ci dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3); 1070cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src0, mask7, src1, 1071cabdff1aSopenharmony_ci src1, mask3, src2, src1, mask7, vec0, vec1, vec2, vec3); 1072cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3, 1073cabdff1aSopenharmony_ci dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3); 1074cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 1075cabdff1aSopenharmony_ci __lsx_vst(dst1, dst, 16); 1076cabdff1aSopenharmony_ci __lsx_vst(dst2, dst, 32); 1077cabdff1aSopenharmony_ci __lsx_vst(dst3, dst, 48); 1078cabdff1aSopenharmony_ci 1079cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, 1080cabdff1aSopenharmony_ci vec4, vec5); 1081cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5); 1082cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, 1083cabdff1aSopenharmony_ci vec4, vec5); 1084cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1, 1085cabdff1aSopenharmony_ci dst4, dst5); 1086cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, 1087cabdff1aSopenharmony_ci vec4, vec5); 1088cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2, 1089cabdff1aSopenharmony_ci dst4, dst5); 1090cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3, 1091cabdff1aSopenharmony_ci vec4, vec5); 1092cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3, 1093cabdff1aSopenharmony_ci dst4, dst5); 1094cabdff1aSopenharmony_ci __lsx_vst(dst4, dst, 64); 1095cabdff1aSopenharmony_ci __lsx_vst(dst5, dst, 80); 1096cabdff1aSopenharmony_ci dst += dst_stride; 1097cabdff1aSopenharmony_ci } 1098cabdff1aSopenharmony_ci} 1099cabdff1aSopenharmony_ci 1100cabdff1aSopenharmony_cistatic void hevc_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride, 1101cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1102cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1103cabdff1aSopenharmony_ci{ 1104cabdff1aSopenharmony_ci uint32_t loop_cnt; 1105cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4; 1106cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 1107cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1108cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3; 1109cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1110cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 1111cabdff1aSopenharmony_ci 1112cabdff1aSopenharmony_ci src -= 3; 1113cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 1114cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1115cabdff1aSopenharmony_ci 1116cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1, 1117cabdff1aSopenharmony_ci mask2, mask3, mask4); 1118cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6) 1119cabdff1aSopenharmony_ci mask7 = __lsx_vaddi_bu(mask0, 14); 1120cabdff1aSopenharmony_ci 1121cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1122cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, 1123cabdff1aSopenharmony_ci src0, src1, src2, src3); 1124cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 56); 1125cabdff1aSopenharmony_ci src += src_stride; 1126cabdff1aSopenharmony_ci 1127cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, 1128cabdff1aSopenharmony_ci src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3); 1129cabdff1aSopenharmony_ci dst0 = __lsx_vdp2_h_bu_b(vec0, filt0); 1130cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2, 1131cabdff1aSopenharmony_ci dst0, dst0); 1132cabdff1aSopenharmony_ci dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3); 1133cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 1134cabdff1aSopenharmony_ci 1135cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask4, src1, src0, mask5, src1, 1136cabdff1aSopenharmony_ci src0, mask6, src1, src0, mask7, vec0, vec1, vec2, vec3); 1137cabdff1aSopenharmony_ci dst1 = __lsx_vdp2_h_bu_b(vec0, filt0); 1138cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2, 1139cabdff1aSopenharmony_ci dst1, dst1); 1140cabdff1aSopenharmony_ci dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3); 1141cabdff1aSopenharmony_ci __lsx_vst(dst1, dst, 16); 1142cabdff1aSopenharmony_ci 1143cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, 1144cabdff1aSopenharmony_ci src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3); 1145cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec0, filt0); 1146cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2, 1147cabdff1aSopenharmony_ci dst2, dst2); 1148cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3); 1149cabdff1aSopenharmony_ci __lsx_vst(dst2, dst, 32); 1150cabdff1aSopenharmony_ci 1151cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src1, mask4, src2, src1, mask5, src2, 1152cabdff1aSopenharmony_ci src1, mask6, src2, src1, mask7, vec0, vec1, vec2, vec3); 1153cabdff1aSopenharmony_ci dst3 = __lsx_vdp2_h_bu_b(vec0, filt0); 1154cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2, 1155cabdff1aSopenharmony_ci dst3, dst3); 1156cabdff1aSopenharmony_ci dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3); 1157cabdff1aSopenharmony_ci __lsx_vst(dst3, dst, 48); 1158cabdff1aSopenharmony_ci 1159cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, 1160cabdff1aSopenharmony_ci src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3); 1161cabdff1aSopenharmony_ci dst4 = __lsx_vdp2_h_bu_b(vec0, filt0); 1162cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2, 1163cabdff1aSopenharmony_ci dst4, dst4); 1164cabdff1aSopenharmony_ci dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3); 1165cabdff1aSopenharmony_ci __lsx_vst(dst4, dst, 64); 1166cabdff1aSopenharmony_ci 1167cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src2, mask4, src3, src2, mask5, src3, 1168cabdff1aSopenharmony_ci src2, mask6, src3, src2, mask7, vec0, vec1, vec2, vec3); 1169cabdff1aSopenharmony_ci dst5 = __lsx_vdp2_h_bu_b(vec0, filt0); 1170cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec1, filt1, dst5, vec2, filt2, 1171cabdff1aSopenharmony_ci dst5, dst5); 1172cabdff1aSopenharmony_ci dst5 = __lsx_vdp2add_h_bu_b(dst5, vec3, filt3); 1173cabdff1aSopenharmony_ci __lsx_vst(dst5, dst, 80); 1174cabdff1aSopenharmony_ci 1175cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, 1176cabdff1aSopenharmony_ci src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3); 1177cabdff1aSopenharmony_ci dst6 = __lsx_vdp2_h_bu_b(vec0, filt0); 1178cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec1, filt1, dst6, vec2, filt2, 1179cabdff1aSopenharmony_ci dst6, dst6); 1180cabdff1aSopenharmony_ci dst6 = __lsx_vdp2add_h_bu_b(dst6, vec3, filt3); 1181cabdff1aSopenharmony_ci __lsx_vst(dst6, dst, 96); 1182cabdff1aSopenharmony_ci 1183cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4, 1184cabdff1aSopenharmony_ci src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3); 1185cabdff1aSopenharmony_ci dst7 = __lsx_vdp2_h_bu_b(vec0, filt0); 1186cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2, 1187cabdff1aSopenharmony_ci dst7, dst7); 1188cabdff1aSopenharmony_ci dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3); 1189cabdff1aSopenharmony_ci __lsx_vst(dst7, dst, 112); 1190cabdff1aSopenharmony_ci dst += dst_stride; 1191cabdff1aSopenharmony_ci } 1192cabdff1aSopenharmony_ci} 1193cabdff1aSopenharmony_ci 1194cabdff1aSopenharmony_cistatic void hevc_vt_8t_4w_lsx(uint8_t *src, int32_t src_stride, 1195cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1196cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1197cabdff1aSopenharmony_ci{ 1198cabdff1aSopenharmony_ci int32_t loop_cnt; 1199cabdff1aSopenharmony_ci int32_t res = (height & 0x07) >> 1; 1200cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 1201cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 1202cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 1203cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; 1204cabdff1aSopenharmony_ci __m128i src9, src10, src11, src12, src13, src14; 1205cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src54_r, src76_r, src98_r; 1206cabdff1aSopenharmony_ci __m128i src21_r, src43_r, src65_r, src87_r, src109_r; 1207cabdff1aSopenharmony_ci __m128i src1110_r, src1211_r, src1312_r, src1413_r; 1208cabdff1aSopenharmony_ci __m128i src2110, src4332, src6554, src8776, src10998; 1209cabdff1aSopenharmony_ci __m128i src12111110, src14131312; 1210cabdff1aSopenharmony_ci __m128i dst10, dst32, dst54, dst76; 1211cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 1212cabdff1aSopenharmony_ci 1213cabdff1aSopenharmony_ci src -= src_stride_3x; 1214cabdff1aSopenharmony_ci 1215cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 1216cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1217cabdff1aSopenharmony_ci 1218cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 1219cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 1220cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 1221cabdff1aSopenharmony_ci src += src_stride_4x; 1222cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 1223cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 1224cabdff1aSopenharmony_ci src += src_stride_3x; 1225cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, 1226cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1227cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r); 1228cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, 1229cabdff1aSopenharmony_ci src2110, src4332); 1230cabdff1aSopenharmony_ci src6554 = __lsx_vilvl_d(src65_r, src54_r); 1231cabdff1aSopenharmony_ci 1232cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 1233cabdff1aSopenharmony_ci src7 = __lsx_vld(src, 0); 1234cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9); 1235cabdff1aSopenharmony_ci src10 = __lsx_vldx(src, src_stride_3x); 1236cabdff1aSopenharmony_ci src += src_stride_4x; 1237cabdff1aSopenharmony_ci src11 = __lsx_vld(src, 0); 1238cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, 1239cabdff1aSopenharmony_ci src12, src13); 1240cabdff1aSopenharmony_ci src14 = __lsx_vldx(src, src_stride_3x); 1241cabdff1aSopenharmony_ci src += src_stride_4x; 1242cabdff1aSopenharmony_ci 1243cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, 1244cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1245cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src11, src10, src12, src11, src13, src12, src14, 1246cabdff1aSopenharmony_ci src13, src1110_r, src1211_r, src1312_r, src1413_r); 1247cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_d, src87_r, src76_r, src109_r, src98_r, src1211_r, 1248cabdff1aSopenharmony_ci src1110_r, src1413_r, src1312_r, src8776, src10998, 1249cabdff1aSopenharmony_ci src12111110, src14131312); 1250cabdff1aSopenharmony_ci 1251cabdff1aSopenharmony_ci dst10 = __lsx_vdp2_h_bu_b(src2110, filt0); 1252cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554, 1253cabdff1aSopenharmony_ci filt2, dst10, dst10); 1254cabdff1aSopenharmony_ci dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3); 1255cabdff1aSopenharmony_ci dst32 = __lsx_vdp2_h_bu_b(src4332, filt0); 1256cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst32, src6554, filt1, dst32, src8776, 1257cabdff1aSopenharmony_ci filt2, dst32, dst32); 1258cabdff1aSopenharmony_ci dst32 = __lsx_vdp2add_h_bu_b(dst32, src10998, filt3); 1259cabdff1aSopenharmony_ci dst54 = __lsx_vdp2_h_bu_b(src6554, filt0); 1260cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst54, src8776, filt1, 1261cabdff1aSopenharmony_ci dst54, src10998, filt2, dst54, dst54); 1262cabdff1aSopenharmony_ci dst54 = __lsx_vdp2add_h_bu_b(dst54, src12111110, filt3); 1263cabdff1aSopenharmony_ci dst76 = __lsx_vdp2_h_bu_b(src8776, filt0); 1264cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst76, src10998, filt1, dst76, 1265cabdff1aSopenharmony_ci src12111110, filt2, dst76, dst76); 1266cabdff1aSopenharmony_ci dst76 = __lsx_vdp2add_h_bu_b(dst76, src14131312, filt3); 1267cabdff1aSopenharmony_ci 1268cabdff1aSopenharmony_ci __lsx_vstelm_d(dst10, dst, 0, 0); 1269cabdff1aSopenharmony_ci dst += dst_stride; 1270cabdff1aSopenharmony_ci __lsx_vstelm_d(dst10, dst, 0, 1); 1271cabdff1aSopenharmony_ci dst += dst_stride; 1272cabdff1aSopenharmony_ci __lsx_vstelm_d(dst32, dst, 0, 0); 1273cabdff1aSopenharmony_ci dst += dst_stride; 1274cabdff1aSopenharmony_ci __lsx_vstelm_d(dst32, dst, 0, 1); 1275cabdff1aSopenharmony_ci dst += dst_stride; 1276cabdff1aSopenharmony_ci __lsx_vstelm_d(dst54, dst, 0, 0); 1277cabdff1aSopenharmony_ci dst += dst_stride; 1278cabdff1aSopenharmony_ci __lsx_vstelm_d(dst54, dst, 0, 1); 1279cabdff1aSopenharmony_ci dst += dst_stride; 1280cabdff1aSopenharmony_ci __lsx_vstelm_d(dst76, dst, 0, 0); 1281cabdff1aSopenharmony_ci dst += dst_stride; 1282cabdff1aSopenharmony_ci __lsx_vstelm_d(dst76, dst, 0, 1); 1283cabdff1aSopenharmony_ci dst += dst_stride; 1284cabdff1aSopenharmony_ci 1285cabdff1aSopenharmony_ci src2110 = src10998; 1286cabdff1aSopenharmony_ci src4332 = src12111110; 1287cabdff1aSopenharmony_ci src6554 = src14131312; 1288cabdff1aSopenharmony_ci src6 = src14; 1289cabdff1aSopenharmony_ci } 1290cabdff1aSopenharmony_ci for (;res--;) { 1291cabdff1aSopenharmony_ci src7 = __lsx_vld(src, 0); 1292cabdff1aSopenharmony_ci src8 = __lsx_vldx(src, src_stride); 1293cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r); 1294cabdff1aSopenharmony_ci src += src_stride_2x; 1295cabdff1aSopenharmony_ci src8776 = __lsx_vilvl_d(src87_r, src76_r); 1296cabdff1aSopenharmony_ci 1297cabdff1aSopenharmony_ci dst10 = __lsx_vdp2_h_bu_b(src2110, filt0); 1298cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554, 1299cabdff1aSopenharmony_ci filt2, dst10, dst10); 1300cabdff1aSopenharmony_ci dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3); 1301cabdff1aSopenharmony_ci 1302cabdff1aSopenharmony_ci __lsx_vstelm_d(dst10, dst, 0, 0); 1303cabdff1aSopenharmony_ci dst += dst_stride; 1304cabdff1aSopenharmony_ci __lsx_vstelm_d(dst10, dst, 0, 1); 1305cabdff1aSopenharmony_ci dst += dst_stride; 1306cabdff1aSopenharmony_ci 1307cabdff1aSopenharmony_ci src2110 = src4332; 1308cabdff1aSopenharmony_ci src4332 = src6554; 1309cabdff1aSopenharmony_ci src6554 = src8776; 1310cabdff1aSopenharmony_ci src6 = src8; 1311cabdff1aSopenharmony_ci } 1312cabdff1aSopenharmony_ci} 1313cabdff1aSopenharmony_ci 1314cabdff1aSopenharmony_cistatic void hevc_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride, 1315cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1316cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1317cabdff1aSopenharmony_ci{ 1318cabdff1aSopenharmony_ci int32_t loop_cnt; 1319cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 1320cabdff1aSopenharmony_ci int32_t dst_stride_x = (dst_stride << 1); 1321cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 1322cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 2); 1323cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 1324cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride_x; 1325cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1326cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src54_r, src76_r, src98_r; 1327cabdff1aSopenharmony_ci __m128i src21_r, src43_r, src65_r, src87_r, src109_r; 1328cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst2_r, dst3_r; 1329cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 1330cabdff1aSopenharmony_ci 1331cabdff1aSopenharmony_ci src -= src_stride_3x; 1332cabdff1aSopenharmony_ci 1333cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 1334cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1335cabdff1aSopenharmony_ci 1336cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 1337cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 1338cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 1339cabdff1aSopenharmony_ci src += src_stride_4x; 1340cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 1341cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 1342cabdff1aSopenharmony_ci src += src_stride_3x; 1343cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, 1344cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1345cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r); 1346cabdff1aSopenharmony_ci 1347cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1348cabdff1aSopenharmony_ci src7 = __lsx_vld(src, 0); 1349cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9); 1350cabdff1aSopenharmony_ci src10 = __lsx_vldx(src, src_stride_3x); 1351cabdff1aSopenharmony_ci src += src_stride_4x; 1352cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, 1353cabdff1aSopenharmony_ci src9, src76_r, src87_r, src98_r, src109_r); 1354cabdff1aSopenharmony_ci 1355cabdff1aSopenharmony_ci dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0); 1356cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r, 1357cabdff1aSopenharmony_ci src54_r, filt2, dst0_r, dst0_r); 1358cabdff1aSopenharmony_ci dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3); 1359cabdff1aSopenharmony_ci dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0); 1360cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r, 1361cabdff1aSopenharmony_ci src65_r, filt2, dst1_r, dst1_r); 1362cabdff1aSopenharmony_ci dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3); 1363cabdff1aSopenharmony_ci dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0); 1364cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r, 1365cabdff1aSopenharmony_ci src76_r, filt2, dst2_r, dst2_r); 1366cabdff1aSopenharmony_ci dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3); 1367cabdff1aSopenharmony_ci dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0); 1368cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r, 1369cabdff1aSopenharmony_ci src87_r, filt2, dst3_r, dst3_r); 1370cabdff1aSopenharmony_ci dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3); 1371cabdff1aSopenharmony_ci 1372cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 1373cabdff1aSopenharmony_ci __lsx_vstx(dst1_r, dst, dst_stride_x); 1374cabdff1aSopenharmony_ci __lsx_vstx(dst2_r, dst, dst_stride_2x); 1375cabdff1aSopenharmony_ci __lsx_vstx(dst3_r, dst, dst_stride_3x); 1376cabdff1aSopenharmony_ci dst += dst_stride_2x; 1377cabdff1aSopenharmony_ci 1378cabdff1aSopenharmony_ci src10_r = src54_r; 1379cabdff1aSopenharmony_ci src32_r = src76_r; 1380cabdff1aSopenharmony_ci src54_r = src98_r; 1381cabdff1aSopenharmony_ci src21_r = src65_r; 1382cabdff1aSopenharmony_ci src43_r = src87_r; 1383cabdff1aSopenharmony_ci src65_r = src109_r; 1384cabdff1aSopenharmony_ci src6 = src10; 1385cabdff1aSopenharmony_ci } 1386cabdff1aSopenharmony_ci} 1387cabdff1aSopenharmony_ci 1388cabdff1aSopenharmony_cistatic void hevc_vt_8t_12w_lsx(uint8_t *src, int32_t src_stride, 1389cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1390cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1391cabdff1aSopenharmony_ci{ 1392cabdff1aSopenharmony_ci int32_t loop_cnt; 1393cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 1394cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 1395cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 1396cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1397cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src54_r, src76_r, src98_r; 1398cabdff1aSopenharmony_ci __m128i src21_r, src43_r, src65_r, src87_r, src109_r; 1399cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst2_r, dst3_r; 1400cabdff1aSopenharmony_ci __m128i src10_l, src32_l, src54_l, src76_l, src98_l; 1401cabdff1aSopenharmony_ci __m128i src21_l, src43_l, src65_l, src87_l, src109_l; 1402cabdff1aSopenharmony_ci __m128i src2110, src4332, src6554, src8776, src10998; 1403cabdff1aSopenharmony_ci __m128i dst0_l, dst1_l; 1404cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 1405cabdff1aSopenharmony_ci 1406cabdff1aSopenharmony_ci src -= src_stride_3x; 1407cabdff1aSopenharmony_ci 1408cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 1409cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1410cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 1411cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 1412cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 1413cabdff1aSopenharmony_ci src += src_stride_4x; 1414cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 1415cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 1416cabdff1aSopenharmony_ci src += src_stride_3x; 1417cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, 1418cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1419cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r); 1420cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, 1421cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 1422cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l); 1423cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, 1424cabdff1aSopenharmony_ci src2110, src4332); 1425cabdff1aSopenharmony_ci src6554 = __lsx_vilvl_d(src65_l, src54_l); 1426cabdff1aSopenharmony_ci 1427cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1428cabdff1aSopenharmony_ci src7 = __lsx_vld(src, 0); 1429cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9); 1430cabdff1aSopenharmony_ci src10 = __lsx_vldx(src, src_stride_3x); 1431cabdff1aSopenharmony_ci src += src_stride_4x; 1432cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, 1433cabdff1aSopenharmony_ci src9, src76_r, src87_r, src98_r, src109_r); 1434cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, 1435cabdff1aSopenharmony_ci src9, src76_l, src87_l, src98_l, src109_l); 1436cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, src87_l, src76_l, src109_l, src98_l, 1437cabdff1aSopenharmony_ci src8776, src10998); 1438cabdff1aSopenharmony_ci 1439cabdff1aSopenharmony_ci dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0); 1440cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r, 1441cabdff1aSopenharmony_ci src54_r, filt2, dst0_r, dst0_r); 1442cabdff1aSopenharmony_ci dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3); 1443cabdff1aSopenharmony_ci dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0); 1444cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r, 1445cabdff1aSopenharmony_ci src65_r, filt2, dst1_r, dst1_r); 1446cabdff1aSopenharmony_ci dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3); 1447cabdff1aSopenharmony_ci dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0); 1448cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r, 1449cabdff1aSopenharmony_ci src76_r, filt2, dst2_r, dst2_r); 1450cabdff1aSopenharmony_ci dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3); 1451cabdff1aSopenharmony_ci dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0); 1452cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r, 1453cabdff1aSopenharmony_ci src87_r, filt2, dst3_r, dst3_r); 1454cabdff1aSopenharmony_ci dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3); 1455cabdff1aSopenharmony_ci dst0_l = __lsx_vdp2_h_bu_b(src2110, filt0); 1456cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src4332, filt1, dst0_l, 1457cabdff1aSopenharmony_ci src6554, filt2, dst0_l, dst0_l); 1458cabdff1aSopenharmony_ci dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src8776, filt3); 1459cabdff1aSopenharmony_ci dst1_l = __lsx_vdp2_h_bu_b(src4332, filt0); 1460cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src6554, filt1, dst1_l, 1461cabdff1aSopenharmony_ci src8776, filt2, dst1_l, dst1_l); 1462cabdff1aSopenharmony_ci dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src10998, filt3); 1463cabdff1aSopenharmony_ci 1464cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 1465cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0_l, dst, 16, 0); 1466cabdff1aSopenharmony_ci dst += dst_stride; 1467cabdff1aSopenharmony_ci __lsx_vst(dst1_r, dst, 0); 1468cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0_l, dst, 16, 1); 1469cabdff1aSopenharmony_ci dst += dst_stride; 1470cabdff1aSopenharmony_ci __lsx_vst(dst2_r, dst, 0); 1471cabdff1aSopenharmony_ci __lsx_vstelm_d(dst1_l, dst, 16, 0); 1472cabdff1aSopenharmony_ci dst += dst_stride; 1473cabdff1aSopenharmony_ci __lsx_vst(dst3_r, dst, 0); 1474cabdff1aSopenharmony_ci __lsx_vstelm_d(dst1_l, dst, 16, 1); 1475cabdff1aSopenharmony_ci dst += dst_stride; 1476cabdff1aSopenharmony_ci 1477cabdff1aSopenharmony_ci src10_r = src54_r; 1478cabdff1aSopenharmony_ci src32_r = src76_r; 1479cabdff1aSopenharmony_ci src54_r = src98_r; 1480cabdff1aSopenharmony_ci src21_r = src65_r; 1481cabdff1aSopenharmony_ci src43_r = src87_r; 1482cabdff1aSopenharmony_ci src65_r = src109_r; 1483cabdff1aSopenharmony_ci src2110 = src6554; 1484cabdff1aSopenharmony_ci src4332 = src8776; 1485cabdff1aSopenharmony_ci src6554 = src10998; 1486cabdff1aSopenharmony_ci src6 = src10; 1487cabdff1aSopenharmony_ci } 1488cabdff1aSopenharmony_ci} 1489cabdff1aSopenharmony_ci 1490cabdff1aSopenharmony_cistatic void hevc_vt_8t_16multx4mult_lsx(uint8_t *src, 1491cabdff1aSopenharmony_ci int32_t src_stride, 1492cabdff1aSopenharmony_ci int16_t *dst, 1493cabdff1aSopenharmony_ci int32_t dst_stride, 1494cabdff1aSopenharmony_ci const int8_t *filter, 1495cabdff1aSopenharmony_ci int32_t height, 1496cabdff1aSopenharmony_ci int32_t width) 1497cabdff1aSopenharmony_ci{ 1498cabdff1aSopenharmony_ci uint8_t *src_tmp; 1499cabdff1aSopenharmony_ci int16_t *dst_tmp; 1500cabdff1aSopenharmony_ci int32_t loop_cnt, cnt; 1501cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 1502cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 1503cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 1504cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1505cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src54_r, src76_r, src98_r; 1506cabdff1aSopenharmony_ci __m128i src21_r, src43_r, src65_r, src87_r, src109_r; 1507cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst2_r, dst3_r; 1508cabdff1aSopenharmony_ci __m128i src10_l, src32_l, src54_l, src76_l, src98_l; 1509cabdff1aSopenharmony_ci __m128i src21_l, src43_l, src65_l, src87_l, src109_l; 1510cabdff1aSopenharmony_ci __m128i dst0_l, dst1_l, dst2_l, dst3_l; 1511cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 1512cabdff1aSopenharmony_ci 1513cabdff1aSopenharmony_ci src -= src_stride_3x; 1514cabdff1aSopenharmony_ci 1515cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 1516cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1517cabdff1aSopenharmony_ci 1518cabdff1aSopenharmony_ci for (cnt = width >> 4; cnt--;) { 1519cabdff1aSopenharmony_ci src_tmp = src; 1520cabdff1aSopenharmony_ci dst_tmp = dst; 1521cabdff1aSopenharmony_ci 1522cabdff1aSopenharmony_ci src0 = __lsx_vld(src_tmp, 0); 1523cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1524cabdff1aSopenharmony_ci src1, src2); 1525cabdff1aSopenharmony_ci src3 = __lsx_vldx(src_tmp, src_stride_3x); 1526cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 1527cabdff1aSopenharmony_ci src4 = __lsx_vld(src_tmp, 0); 1528cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1529cabdff1aSopenharmony_ci src5, src6); 1530cabdff1aSopenharmony_ci src_tmp += src_stride_3x; 1531cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, 1532cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1533cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r); 1534cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, 1535cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 1536cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l); 1537cabdff1aSopenharmony_ci 1538cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1539cabdff1aSopenharmony_ci src7 = __lsx_vld(src_tmp, 0); 1540cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1541cabdff1aSopenharmony_ci src8, src9); 1542cabdff1aSopenharmony_ci src10 = __lsx_vldx(src_tmp, src_stride_3x); 1543cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 1544cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, 1545cabdff1aSopenharmony_ci src10, src9, src76_r, src87_r, src98_r, src109_r); 1546cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, 1547cabdff1aSopenharmony_ci src10, src9, src76_l, src87_l, src98_l, src109_l); 1548cabdff1aSopenharmony_ci 1549cabdff1aSopenharmony_ci dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0); 1550cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r, 1551cabdff1aSopenharmony_ci src54_r, filt2, dst0_r, dst0_r); 1552cabdff1aSopenharmony_ci dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3); 1553cabdff1aSopenharmony_ci dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0); 1554cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r, 1555cabdff1aSopenharmony_ci src65_r, filt2, dst1_r, dst1_r); 1556cabdff1aSopenharmony_ci dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3); 1557cabdff1aSopenharmony_ci dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0); 1558cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r, 1559cabdff1aSopenharmony_ci src76_r, filt2, dst2_r, dst2_r); 1560cabdff1aSopenharmony_ci dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3); 1561cabdff1aSopenharmony_ci dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0); 1562cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r, 1563cabdff1aSopenharmony_ci src87_r, filt2, dst3_r, dst3_r); 1564cabdff1aSopenharmony_ci dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3); 1565cabdff1aSopenharmony_ci dst0_l = __lsx_vdp2_h_bu_b(src10_l, filt0); 1566cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src32_l, filt1, dst0_l, 1567cabdff1aSopenharmony_ci src54_l, filt2, dst0_l, dst0_l); 1568cabdff1aSopenharmony_ci dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src76_l, filt3); 1569cabdff1aSopenharmony_ci dst1_l = __lsx_vdp2_h_bu_b(src21_l, filt0); 1570cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src43_l, filt1, dst1_l, 1571cabdff1aSopenharmony_ci src65_l, filt2, dst1_l, dst1_l); 1572cabdff1aSopenharmony_ci dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src87_l, filt3); 1573cabdff1aSopenharmony_ci dst2_l = __lsx_vdp2_h_bu_b(src32_l, filt0); 1574cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_l, src54_l, filt1, dst2_l, 1575cabdff1aSopenharmony_ci src76_l, filt2, dst2_l, dst2_l); 1576cabdff1aSopenharmony_ci dst2_l = __lsx_vdp2add_h_bu_b(dst2_l, src98_l, filt3); 1577cabdff1aSopenharmony_ci dst3_l = __lsx_vdp2_h_bu_b(src43_l, filt0); 1578cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_l, src65_l, filt1, dst3_l, 1579cabdff1aSopenharmony_ci src87_l, filt2, dst3_l, dst3_l); 1580cabdff1aSopenharmony_ci dst3_l = __lsx_vdp2add_h_bu_b(dst3_l, src109_l, filt3); 1581cabdff1aSopenharmony_ci 1582cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst_tmp, 0); 1583cabdff1aSopenharmony_ci __lsx_vst(dst0_l, dst_tmp, 16); 1584cabdff1aSopenharmony_ci dst_tmp += dst_stride; 1585cabdff1aSopenharmony_ci __lsx_vst(dst1_r, dst_tmp, 0); 1586cabdff1aSopenharmony_ci __lsx_vst(dst1_l, dst_tmp, 16); 1587cabdff1aSopenharmony_ci dst_tmp += dst_stride; 1588cabdff1aSopenharmony_ci __lsx_vst(dst2_r, dst_tmp, 0); 1589cabdff1aSopenharmony_ci __lsx_vst(dst2_l, dst_tmp, 16); 1590cabdff1aSopenharmony_ci dst_tmp += dst_stride; 1591cabdff1aSopenharmony_ci __lsx_vst(dst3_r, dst_tmp, 0); 1592cabdff1aSopenharmony_ci __lsx_vst(dst3_l, dst_tmp, 16); 1593cabdff1aSopenharmony_ci dst_tmp += dst_stride; 1594cabdff1aSopenharmony_ci 1595cabdff1aSopenharmony_ci src10_r = src54_r; 1596cabdff1aSopenharmony_ci src32_r = src76_r; 1597cabdff1aSopenharmony_ci src54_r = src98_r; 1598cabdff1aSopenharmony_ci src21_r = src65_r; 1599cabdff1aSopenharmony_ci src43_r = src87_r; 1600cabdff1aSopenharmony_ci src65_r = src109_r; 1601cabdff1aSopenharmony_ci src10_l = src54_l; 1602cabdff1aSopenharmony_ci src32_l = src76_l; 1603cabdff1aSopenharmony_ci src54_l = src98_l; 1604cabdff1aSopenharmony_ci src21_l = src65_l; 1605cabdff1aSopenharmony_ci src43_l = src87_l; 1606cabdff1aSopenharmony_ci src65_l = src109_l; 1607cabdff1aSopenharmony_ci src6 = src10; 1608cabdff1aSopenharmony_ci } 1609cabdff1aSopenharmony_ci src += 16; 1610cabdff1aSopenharmony_ci dst += 16; 1611cabdff1aSopenharmony_ci } 1612cabdff1aSopenharmony_ci} 1613cabdff1aSopenharmony_ci 1614cabdff1aSopenharmony_cistatic void hevc_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride, 1615cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1616cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1617cabdff1aSopenharmony_ci{ 1618cabdff1aSopenharmony_ci hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride, 1619cabdff1aSopenharmony_ci filter, height, 16); 1620cabdff1aSopenharmony_ci} 1621cabdff1aSopenharmony_ci 1622cabdff1aSopenharmony_cistatic void hevc_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride, 1623cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1624cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1625cabdff1aSopenharmony_ci{ 1626cabdff1aSopenharmony_ci hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride, 1627cabdff1aSopenharmony_ci filter, height, 16); 1628cabdff1aSopenharmony_ci hevc_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride, 1629cabdff1aSopenharmony_ci filter, height); 1630cabdff1aSopenharmony_ci} 1631cabdff1aSopenharmony_ci 1632cabdff1aSopenharmony_cistatic void hevc_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride, 1633cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1634cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1635cabdff1aSopenharmony_ci{ 1636cabdff1aSopenharmony_ci hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride, 1637cabdff1aSopenharmony_ci filter, height, 32); 1638cabdff1aSopenharmony_ci} 1639cabdff1aSopenharmony_ci 1640cabdff1aSopenharmony_cistatic void hevc_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride, 1641cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1642cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1643cabdff1aSopenharmony_ci{ 1644cabdff1aSopenharmony_ci hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride, 1645cabdff1aSopenharmony_ci filter, height, 48); 1646cabdff1aSopenharmony_ci} 1647cabdff1aSopenharmony_ci 1648cabdff1aSopenharmony_cistatic void hevc_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride, 1649cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1650cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1651cabdff1aSopenharmony_ci{ 1652cabdff1aSopenharmony_ci hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride, 1653cabdff1aSopenharmony_ci filter, height, 64); 1654cabdff1aSopenharmony_ci} 1655cabdff1aSopenharmony_ci 1656cabdff1aSopenharmony_cistatic void hevc_hv_8t_4w_lsx(uint8_t *src, int32_t src_stride, 1657cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1658cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1659cabdff1aSopenharmony_ci int32_t height) 1660cabdff1aSopenharmony_ci{ 1661cabdff1aSopenharmony_ci uint32_t loop_cnt; 1662cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 1663cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 1664cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 1665cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1666cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 1667cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1, filt_h2, filt_h3; 1668cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 1669cabdff1aSopenharmony_ci __m128i filter_vec; 1670cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1671cabdff1aSopenharmony_ci __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1672cabdff1aSopenharmony_ci __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108; 1673cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst2_r, dst3_r; 1674cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r; 1675cabdff1aSopenharmony_ci __m128i dst21_r, dst43_r, dst65_r, dst87_r, dst109_r; 1676cabdff1aSopenharmony_ci __m128i mask0; 1677cabdff1aSopenharmony_ci 1678cabdff1aSopenharmony_ci mask0 = __lsx_vld(ff_hevc_mask_arr, 16); 1679cabdff1aSopenharmony_ci 1680cabdff1aSopenharmony_ci src -= src_stride_3x + 3; 1681cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, 1682cabdff1aSopenharmony_ci filter_x, 6, filt0, filt1, filt2, filt3); 1683cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 1684cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 1685cabdff1aSopenharmony_ci 1686cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2, 1687cabdff1aSopenharmony_ci filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3); 1688cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 1689cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 1690cabdff1aSopenharmony_ci 1691cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 1692cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 1693cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 1694cabdff1aSopenharmony_ci src += src_stride_4x; 1695cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 1696cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 1697cabdff1aSopenharmony_ci src += src_stride_3x; 1698cabdff1aSopenharmony_ci 1699cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src0, mask0, src3, src0, mask1, src3, src0, 1700cabdff1aSopenharmony_ci mask2, src3, src0, mask3, vec0, vec1, vec2, vec3); 1701cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src1, mask0, src4, src1, mask1, src4, src1, 1702cabdff1aSopenharmony_ci mask2, src4, src1, mask3, vec4, vec5, vec6, vec7); 1703cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src5, src2, mask0, src5, src2, mask1, src5, src2, 1704cabdff1aSopenharmony_ci mask2, src5, src2, mask3, vec8, vec9, vec10, vec11); 1705cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask0, src6, src3, mask1, src6, src3, 1706cabdff1aSopenharmony_ci mask2, src6, src3, mask3, vec12, vec13, vec14, vec15); 1707cabdff1aSopenharmony_ci dst30 = __lsx_vdp2_h_bu_b(vec0, filt0); 1708cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2, 1709cabdff1aSopenharmony_ci dst30, dst30); 1710cabdff1aSopenharmony_ci dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3); 1711cabdff1aSopenharmony_ci dst41 = __lsx_vdp2_h_bu_b(vec4, filt0); 1712cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2, 1713cabdff1aSopenharmony_ci dst41, dst41); 1714cabdff1aSopenharmony_ci dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3); 1715cabdff1aSopenharmony_ci dst52 = __lsx_vdp2_h_bu_b(vec8, filt0); 1716cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2, 1717cabdff1aSopenharmony_ci dst52, dst52); 1718cabdff1aSopenharmony_ci dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3); 1719cabdff1aSopenharmony_ci dst63 = __lsx_vdp2_h_bu_b(vec12, filt0); 1720cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2, 1721cabdff1aSopenharmony_ci dst63, dst63); 1722cabdff1aSopenharmony_ci dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3); 1723cabdff1aSopenharmony_ci 1724cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r); 1725cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r); 1726cabdff1aSopenharmony_ci dst32_r = __lsx_vilvl_h(dst63, dst52); 1727cabdff1aSopenharmony_ci dst65_r = __lsx_vilvh_h(dst63, dst52); 1728cabdff1aSopenharmony_ci dst66 = __lsx_vreplvei_d(dst63, 1); 1729cabdff1aSopenharmony_ci 1730cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 1731cabdff1aSopenharmony_ci src7 = __lsx_vld(src, 0); 1732cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9); 1733cabdff1aSopenharmony_ci src10 = __lsx_vldx(src, src_stride_3x); 1734cabdff1aSopenharmony_ci src += src_stride_4x; 1735cabdff1aSopenharmony_ci 1736cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask0, src9, src7, mask1, src9, src7, 1737cabdff1aSopenharmony_ci mask2, src9, src7, mask3, vec0, vec1, vec2, vec3); 1738cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask0, src10, src8, mask1, src10, src8, 1739cabdff1aSopenharmony_ci mask2, src10, src8, mask3, vec4, vec5, vec6, vec7); 1740cabdff1aSopenharmony_ci 1741cabdff1aSopenharmony_ci dst97 = __lsx_vdp2_h_bu_b(vec0, filt0); 1742cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2, 1743cabdff1aSopenharmony_ci dst97, dst97); 1744cabdff1aSopenharmony_ci dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3); 1745cabdff1aSopenharmony_ci dst108 = __lsx_vdp2_h_bu_b(vec4, filt0); 1746cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6, 1747cabdff1aSopenharmony_ci filt2, dst108, dst108); 1748cabdff1aSopenharmony_ci dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3); 1749cabdff1aSopenharmony_ci 1750cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r); 1751cabdff1aSopenharmony_ci dst109_r = __lsx_vilvh_h(dst108, dst97); 1752cabdff1aSopenharmony_ci dst66 = __lsx_vreplvei_d(dst97, 1); 1753cabdff1aSopenharmony_ci dst98_r = __lsx_vilvl_h(dst66, dst108); 1754cabdff1aSopenharmony_ci 1755cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r, 1756cabdff1aSopenharmony_ci filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r); 1757cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r, 1758cabdff1aSopenharmony_ci filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1, 1759cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 1760cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r, 1761cabdff1aSopenharmony_ci filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2, 1762cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 1763cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r, 1764cabdff1aSopenharmony_ci filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3, 1765cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 1766cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6, 1767cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 1768cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r); 1769cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0_r, dst, 0, 0); 1770cabdff1aSopenharmony_ci dst += dst_stride; 1771cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0_r, dst, 0, 1); 1772cabdff1aSopenharmony_ci dst += dst_stride; 1773cabdff1aSopenharmony_ci __lsx_vstelm_d(dst2_r, dst, 0, 0); 1774cabdff1aSopenharmony_ci dst += dst_stride; 1775cabdff1aSopenharmony_ci __lsx_vstelm_d(dst2_r, dst, 0, 1); 1776cabdff1aSopenharmony_ci dst += dst_stride; 1777cabdff1aSopenharmony_ci 1778cabdff1aSopenharmony_ci dst10_r = dst54_r; 1779cabdff1aSopenharmony_ci dst32_r = dst76_r; 1780cabdff1aSopenharmony_ci dst54_r = dst98_r; 1781cabdff1aSopenharmony_ci dst21_r = dst65_r; 1782cabdff1aSopenharmony_ci dst43_r = dst87_r; 1783cabdff1aSopenharmony_ci dst65_r = dst109_r; 1784cabdff1aSopenharmony_ci dst66 = __lsx_vreplvei_d(dst108, 1); 1785cabdff1aSopenharmony_ci } 1786cabdff1aSopenharmony_ci} 1787cabdff1aSopenharmony_ci 1788cabdff1aSopenharmony_cistatic void hevc_hv_8t_8multx1mult_lsx(uint8_t *src, 1789cabdff1aSopenharmony_ci int32_t src_stride, 1790cabdff1aSopenharmony_ci int16_t *dst, 1791cabdff1aSopenharmony_ci int32_t dst_stride, 1792cabdff1aSopenharmony_ci const int8_t *filter_x, 1793cabdff1aSopenharmony_ci const int8_t *filter_y, 1794cabdff1aSopenharmony_ci int32_t height, 1795cabdff1aSopenharmony_ci int32_t width) 1796cabdff1aSopenharmony_ci{ 1797cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 1798cabdff1aSopenharmony_ci uint8_t *src_tmp; 1799cabdff1aSopenharmony_ci int16_t *dst_tmp; 1800cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 1801cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 1802cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 1803cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 1804cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3; 1805cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1, filt_h2, filt_h3; 1806cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 1807cabdff1aSopenharmony_ci __m128i filter_vec; 1808cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1809cabdff1aSopenharmony_ci __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1810cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1811cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l; 1812cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst54_r, dst76_r; 1813cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst54_l, dst76_l; 1814cabdff1aSopenharmony_ci __m128i mask0 = {0x403030202010100, 0x807070606050504}; 1815cabdff1aSopenharmony_ci 1816cabdff1aSopenharmony_ci src -= src_stride_3x + 3; 1817cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, 1818cabdff1aSopenharmony_ci filter_x, 6, filt0, filt1, filt2, filt3); 1819cabdff1aSopenharmony_ci 1820cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 1821cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 1822cabdff1aSopenharmony_ci 1823cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2, 1824cabdff1aSopenharmony_ci filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3); 1825cabdff1aSopenharmony_ci 1826cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 1827cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 1828cabdff1aSopenharmony_ci 1829cabdff1aSopenharmony_ci for (cnt = width >> 3; cnt--;) { 1830cabdff1aSopenharmony_ci src_tmp = src; 1831cabdff1aSopenharmony_ci dst_tmp = dst; 1832cabdff1aSopenharmony_ci src0 = __lsx_vld(src_tmp, 0); 1833cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1834cabdff1aSopenharmony_ci src1, src2); 1835cabdff1aSopenharmony_ci src3 = __lsx_vldx(src_tmp, src_stride_3x); 1836cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 1837cabdff1aSopenharmony_ci src4 = __lsx_vld(src_tmp, 0); 1838cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1839cabdff1aSopenharmony_ci src5, src6); 1840cabdff1aSopenharmony_ci src_tmp += src_stride_3x; 1841cabdff1aSopenharmony_ci 1842cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 1843cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, 1844cabdff1aSopenharmony_ci src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3); 1845cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, 1846cabdff1aSopenharmony_ci src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7); 1847cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, 1848cabdff1aSopenharmony_ci src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11); 1849cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, 1850cabdff1aSopenharmony_ci src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15); 1851cabdff1aSopenharmony_ci dst0 = __lsx_vdp2_h_bu_b(vec0, filt0); 1852cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2, 1853cabdff1aSopenharmony_ci dst0, dst0); 1854cabdff1aSopenharmony_ci dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3); 1855cabdff1aSopenharmony_ci dst1 = __lsx_vdp2_h_bu_b(vec4, filt0); 1856cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2, 1857cabdff1aSopenharmony_ci dst1, dst1); 1858cabdff1aSopenharmony_ci dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3); 1859cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec8, filt0); 1860cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2, 1861cabdff1aSopenharmony_ci dst2, dst2); 1862cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3); 1863cabdff1aSopenharmony_ci dst3 = __lsx_vdp2_h_bu_b(vec12, filt0); 1864cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2, 1865cabdff1aSopenharmony_ci dst3, dst3); 1866cabdff1aSopenharmony_ci dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3); 1867cabdff1aSopenharmony_ci 1868cabdff1aSopenharmony_ci /* row 4 row 5 row 6 */ 1869cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4, 1870cabdff1aSopenharmony_ci src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3); 1871cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5, 1872cabdff1aSopenharmony_ci src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7); 1873cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6, 1874cabdff1aSopenharmony_ci src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11); 1875cabdff1aSopenharmony_ci dst4 = __lsx_vdp2_h_bu_b(vec0, filt0); 1876cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2, 1877cabdff1aSopenharmony_ci dst4, dst4); 1878cabdff1aSopenharmony_ci dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3); 1879cabdff1aSopenharmony_ci dst5 = __lsx_vdp2_h_bu_b(vec4, filt0); 1880cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2, 1881cabdff1aSopenharmony_ci dst5, dst5); 1882cabdff1aSopenharmony_ci dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3); 1883cabdff1aSopenharmony_ci dst6 = __lsx_vdp2_h_bu_b(vec8, filt0); 1884cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2, 1885cabdff1aSopenharmony_ci dst6, dst6); 1886cabdff1aSopenharmony_ci dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3); 1887cabdff1aSopenharmony_ci 1888cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1889cabdff1aSopenharmony_ci src7 = __lsx_vld(src_tmp, 0); 1890cabdff1aSopenharmony_ci src_tmp += src_stride; 1891cabdff1aSopenharmony_ci 1892cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7, 1893cabdff1aSopenharmony_ci src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3); 1894cabdff1aSopenharmony_ci dst7 = __lsx_vdp2_h_bu_b(vec0, filt0); 1895cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, 1896cabdff1aSopenharmony_ci filt2, dst7, dst7); 1897cabdff1aSopenharmony_ci dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3); 1898cabdff1aSopenharmony_ci 1899cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, 1900cabdff1aSopenharmony_ci dst6, dst10_r, dst32_r, dst54_r, dst76_r); 1901cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, 1902cabdff1aSopenharmony_ci dst6, dst10_l, dst32_l, dst54_l, dst76_l); 1903cabdff1aSopenharmony_ci 1904cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, 1905cabdff1aSopenharmony_ci dst0_r, dst0_l); 1906cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, 1907cabdff1aSopenharmony_ci dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l, 1908cabdff1aSopenharmony_ci dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l); 1909cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l, 1910cabdff1aSopenharmony_ci dst76_l, filt_h3, dst0_r, dst0_l); 1911cabdff1aSopenharmony_ci dst0_r = __lsx_vsrai_w(dst0_r, 6); 1912cabdff1aSopenharmony_ci dst0_l = __lsx_vsrai_w(dst0_l, 6); 1913cabdff1aSopenharmony_ci 1914cabdff1aSopenharmony_ci dst0_r = __lsx_vpickev_h(dst0_l, dst0_r); 1915cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst_tmp, 0); 1916cabdff1aSopenharmony_ci dst_tmp += dst_stride; 1917cabdff1aSopenharmony_ci 1918cabdff1aSopenharmony_ci dst0 = dst1; 1919cabdff1aSopenharmony_ci dst1 = dst2; 1920cabdff1aSopenharmony_ci dst2 = dst3; 1921cabdff1aSopenharmony_ci dst3 = dst4; 1922cabdff1aSopenharmony_ci dst4 = dst5; 1923cabdff1aSopenharmony_ci dst5 = dst6; 1924cabdff1aSopenharmony_ci dst6 = dst7; 1925cabdff1aSopenharmony_ci } 1926cabdff1aSopenharmony_ci src += 8; 1927cabdff1aSopenharmony_ci dst += 8; 1928cabdff1aSopenharmony_ci } 1929cabdff1aSopenharmony_ci} 1930cabdff1aSopenharmony_ci 1931cabdff1aSopenharmony_cistatic void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, 1932cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1933cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1934cabdff1aSopenharmony_ci int32_t height) 1935cabdff1aSopenharmony_ci{ 1936cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride, 1937cabdff1aSopenharmony_ci filter_x, filter_y, height, 8); 1938cabdff1aSopenharmony_ci} 1939cabdff1aSopenharmony_ci 1940cabdff1aSopenharmony_cistatic void hevc_hv_8t_12w_lsx(uint8_t *src, int32_t src_stride, 1941cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 1942cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 1943cabdff1aSopenharmony_ci int32_t height) 1944cabdff1aSopenharmony_ci{ 1945cabdff1aSopenharmony_ci uint32_t loop_cnt; 1946cabdff1aSopenharmony_ci uint8_t *src_tmp; 1947cabdff1aSopenharmony_ci int16_t *dst_tmp; 1948cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 1949cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 1950cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 1951cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1952cabdff1aSopenharmony_ci __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1953cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1954cabdff1aSopenharmony_ci __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1955cabdff1aSopenharmony_ci __m128i filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3; 1956cabdff1aSopenharmony_ci __m128i filter_vec; 1957cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1958cabdff1aSopenharmony_ci __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108; 1959cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r; 1960cabdff1aSopenharmony_ci __m128i dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l; 1961cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst2_r, dst3_r; 1962cabdff1aSopenharmony_ci 1963cabdff1aSopenharmony_ci src -= src_stride_3x + 3; 1964cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, 1965cabdff1aSopenharmony_ci filter_x, 6, filt0, filt1, filt2, filt3); 1966cabdff1aSopenharmony_ci 1967cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 1968cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 1969cabdff1aSopenharmony_ci 1970cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2, 1971cabdff1aSopenharmony_ci filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3); 1972cabdff1aSopenharmony_ci 1973cabdff1aSopenharmony_ci mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 1974cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 1975cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 6); 1976cabdff1aSopenharmony_ci 1977cabdff1aSopenharmony_ci src_tmp = src; 1978cabdff1aSopenharmony_ci dst_tmp = dst; 1979cabdff1aSopenharmony_ci 1980cabdff1aSopenharmony_ci src0 = __lsx_vld(src_tmp, 0); 1981cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1982cabdff1aSopenharmony_ci src1, src2); 1983cabdff1aSopenharmony_ci src3 = __lsx_vldx(src_tmp, src_stride_3x); 1984cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 1985cabdff1aSopenharmony_ci src4 = __lsx_vld(src_tmp, 0); 1986cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 1987cabdff1aSopenharmony_ci src5, src6); 1988cabdff1aSopenharmony_ci src_tmp += src_stride_3x; 1989cabdff1aSopenharmony_ci 1990cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 1991cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, src0, 1992cabdff1aSopenharmony_ci mask2, src0, src0, mask3, vec0, vec1, vec2, vec3); 1993cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, src1, 1994cabdff1aSopenharmony_ci mask2, src1, src1, mask3, vec4, vec5, vec6, vec7); 1995cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, src2, 1996cabdff1aSopenharmony_ci mask2, src2, src2, mask3, vec8, vec9, vec10, vec11); 1997cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, src3, 1998cabdff1aSopenharmony_ci mask2, src3, src3, mask3, vec12, vec13, vec14, vec15); 1999cabdff1aSopenharmony_ci dst0 = __lsx_vdp2_h_bu_b(vec0, filt0); 2000cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2, 2001cabdff1aSopenharmony_ci dst0, dst0); 2002cabdff1aSopenharmony_ci dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3); 2003cabdff1aSopenharmony_ci dst1 = __lsx_vdp2_h_bu_b(vec4, filt0); 2004cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2, 2005cabdff1aSopenharmony_ci dst1, dst1); 2006cabdff1aSopenharmony_ci dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3); 2007cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec8, filt0); 2008cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2, 2009cabdff1aSopenharmony_ci dst2, dst2); 2010cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3); 2011cabdff1aSopenharmony_ci dst3 = __lsx_vdp2_h_bu_b(vec12, filt0); 2012cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2, 2013cabdff1aSopenharmony_ci dst3, dst3); 2014cabdff1aSopenharmony_ci dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3); 2015cabdff1aSopenharmony_ci 2016cabdff1aSopenharmony_ci /* row 4 row 5 row 6 */ 2017cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4, src4, 2018cabdff1aSopenharmony_ci mask2, src4, src4, mask3, vec0, vec1, vec2, vec3); 2019cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5, src5, 2020cabdff1aSopenharmony_ci mask2, src5, src5, mask3, vec4, vec5, vec6, vec7); 2021cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6, src6, 2022cabdff1aSopenharmony_ci mask2, src6, src6, mask3, vec8, vec9, vec10, vec11); 2023cabdff1aSopenharmony_ci dst4 = __lsx_vdp2_h_bu_b(vec0, filt0); 2024cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2, 2025cabdff1aSopenharmony_ci dst4, dst4); 2026cabdff1aSopenharmony_ci dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3); 2027cabdff1aSopenharmony_ci dst5 = __lsx_vdp2_h_bu_b(vec4, filt0); 2028cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2, 2029cabdff1aSopenharmony_ci dst5, dst5); 2030cabdff1aSopenharmony_ci dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3); 2031cabdff1aSopenharmony_ci dst6 = __lsx_vdp2_h_bu_b(vec8, filt0); 2032cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2, 2033cabdff1aSopenharmony_ci dst6, dst6); 2034cabdff1aSopenharmony_ci dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3); 2035cabdff1aSopenharmony_ci 2036cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 2037cabdff1aSopenharmony_ci src7 = __lsx_vld(src_tmp, 0); 2038cabdff1aSopenharmony_ci src_tmp += src_stride; 2039cabdff1aSopenharmony_ci 2040cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7, 2041cabdff1aSopenharmony_ci src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3); 2042cabdff1aSopenharmony_ci dst7 = __lsx_vdp2_h_bu_b(vec0, filt0); 2043cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2, 2044cabdff1aSopenharmony_ci dst7, dst7); 2045cabdff1aSopenharmony_ci dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3); 2046cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 2047cabdff1aSopenharmony_ci dst10_r, dst32_r, dst54_r, dst76_r); 2048cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 2049cabdff1aSopenharmony_ci dst10_l, dst32_l, dst54_l, dst76_l); 2050cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, 2051cabdff1aSopenharmony_ci dst0_r, dst0_l); 2052cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l, 2053cabdff1aSopenharmony_ci filt_h1, dst0_r, dst54_r, filt_h2, dst0_l, dst54_l, filt_h2, 2054cabdff1aSopenharmony_ci dst0_r, dst0_l, dst0_r, dst0_l); 2055cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l, dst76_l, 2056cabdff1aSopenharmony_ci filt_h3, dst0_r, dst0_l) 2057cabdff1aSopenharmony_ci dst0_r = __lsx_vsrai_w(dst0_r, 6); 2058cabdff1aSopenharmony_ci dst0_l = __lsx_vsrai_w(dst0_l, 6); 2059cabdff1aSopenharmony_ci 2060cabdff1aSopenharmony_ci dst0_r = __lsx_vpickev_h(dst0_l, dst0_r); 2061cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst_tmp, 0); 2062cabdff1aSopenharmony_ci dst_tmp += dst_stride; 2063cabdff1aSopenharmony_ci 2064cabdff1aSopenharmony_ci dst0 = dst1; 2065cabdff1aSopenharmony_ci dst1 = dst2; 2066cabdff1aSopenharmony_ci dst2 = dst3; 2067cabdff1aSopenharmony_ci dst3 = dst4; 2068cabdff1aSopenharmony_ci dst4 = dst5; 2069cabdff1aSopenharmony_ci dst5 = dst6; 2070cabdff1aSopenharmony_ci dst6 = dst7; 2071cabdff1aSopenharmony_ci } 2072cabdff1aSopenharmony_ci src += 8; 2073cabdff1aSopenharmony_ci dst += 8; 2074cabdff1aSopenharmony_ci 2075cabdff1aSopenharmony_ci mask4 = __lsx_vld(ff_hevc_mask_arr, 16); 2076cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6); 2077cabdff1aSopenharmony_ci mask7 = __lsx_vaddi_bu(mask4, 6); 2078cabdff1aSopenharmony_ci 2079cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 2080cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 2081cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 2082cabdff1aSopenharmony_ci src += src_stride_4x; 2083cabdff1aSopenharmony_ci src4 = __lsx_vld(src, 0); 2084cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 2085cabdff1aSopenharmony_ci src += src_stride_3x; 2086cabdff1aSopenharmony_ci 2087cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src3, src0, mask4, src3, src0, mask5, src3, src0, 2088cabdff1aSopenharmony_ci mask6, src3, src0, mask7, vec0, vec1, vec2, vec3); 2089cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src1, mask4, src4, src1, mask5, src4, src1, 2090cabdff1aSopenharmony_ci mask6, src4, src1, mask7, vec4, vec5, vec6, vec7); 2091cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src5, src2, mask4, src5, src2, mask5, src5, src2, 2092cabdff1aSopenharmony_ci mask6, src5, src2, mask7, vec8, vec9, vec10, vec11); 2093cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask4, src6, src3, mask5, src6, src3, 2094cabdff1aSopenharmony_ci mask6, src6, src3, mask7, vec12, vec13, vec14, vec15); 2095cabdff1aSopenharmony_ci dst30 = __lsx_vdp2_h_bu_b(vec0, filt0); 2096cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2, 2097cabdff1aSopenharmony_ci dst30, dst30); 2098cabdff1aSopenharmony_ci dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3); 2099cabdff1aSopenharmony_ci dst41 = __lsx_vdp2_h_bu_b(vec4, filt0); 2100cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2, 2101cabdff1aSopenharmony_ci dst41, dst41); 2102cabdff1aSopenharmony_ci dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3); 2103cabdff1aSopenharmony_ci dst52 = __lsx_vdp2_h_bu_b(vec8, filt0); 2104cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2, 2105cabdff1aSopenharmony_ci dst52, dst52); 2106cabdff1aSopenharmony_ci dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3); 2107cabdff1aSopenharmony_ci dst63 = __lsx_vdp2_h_bu_b(vec12, filt0); 2108cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2, 2109cabdff1aSopenharmony_ci dst63, dst63); 2110cabdff1aSopenharmony_ci dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3); 2111cabdff1aSopenharmony_ci 2112cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r); 2113cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r); 2114cabdff1aSopenharmony_ci dst32_r = __lsx_vilvl_h(dst63, dst52); 2115cabdff1aSopenharmony_ci dst65_r = __lsx_vilvh_h(dst63, dst52); 2116cabdff1aSopenharmony_ci 2117cabdff1aSopenharmony_ci dst66 = __lsx_vreplvei_d(dst63, 1); 2118cabdff1aSopenharmony_ci 2119cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 2120cabdff1aSopenharmony_ci src7 = __lsx_vld(src, 0); 2121cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9); 2122cabdff1aSopenharmony_ci src10 = __lsx_vldx(src, src_stride_3x); 2123cabdff1aSopenharmony_ci src += src_stride_4x; 2124cabdff1aSopenharmony_ci 2125cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask4, src9, src7, mask5, src9, 2126cabdff1aSopenharmony_ci src7, mask6, src9, src7, mask7, vec0, vec1, vec2, vec3); 2127cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask4, src10, src8, mask5, src10, 2128cabdff1aSopenharmony_ci src8, mask6, src10, src8, mask7, vec4, vec5, vec6, vec7); 2129cabdff1aSopenharmony_ci dst97 = __lsx_vdp2_h_bu_b(vec0, filt0); 2130cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2, 2131cabdff1aSopenharmony_ci dst97, dst97); 2132cabdff1aSopenharmony_ci dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3); 2133cabdff1aSopenharmony_ci dst108 = __lsx_vdp2_h_bu_b(vec4, filt0); 2134cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6, 2135cabdff1aSopenharmony_ci filt2, dst108, dst108); 2136cabdff1aSopenharmony_ci dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3); 2137cabdff1aSopenharmony_ci 2138cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r); 2139cabdff1aSopenharmony_ci dst109_r = __lsx_vilvh_h(dst108, dst97); 2140cabdff1aSopenharmony_ci dst66 = __lsx_vreplvei_d(dst97, 1); 2141cabdff1aSopenharmony_ci dst98_r = __lsx_vilvl_h(dst66, dst108); 2142cabdff1aSopenharmony_ci 2143cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r, 2144cabdff1aSopenharmony_ci filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r); 2145cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r, 2146cabdff1aSopenharmony_ci filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1, 2147cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 2148cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r, 2149cabdff1aSopenharmony_ci filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2, 2150cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 2151cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r, 2152cabdff1aSopenharmony_ci filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3, 2153cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 2154cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6, 2155cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 2156cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, 2157cabdff1aSopenharmony_ci dst0_r, dst2_r); 2158cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0_r, dst, 0, 0); 2159cabdff1aSopenharmony_ci dst += dst_stride; 2160cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0_r, dst, 0, 1); 2161cabdff1aSopenharmony_ci dst += dst_stride; 2162cabdff1aSopenharmony_ci __lsx_vstelm_d(dst2_r, dst, 0, 0); 2163cabdff1aSopenharmony_ci dst += dst_stride; 2164cabdff1aSopenharmony_ci __lsx_vstelm_d(dst2_r, dst, 0, 1); 2165cabdff1aSopenharmony_ci dst += dst_stride; 2166cabdff1aSopenharmony_ci 2167cabdff1aSopenharmony_ci dst10_r = dst54_r; 2168cabdff1aSopenharmony_ci dst32_r = dst76_r; 2169cabdff1aSopenharmony_ci dst54_r = dst98_r; 2170cabdff1aSopenharmony_ci dst21_r = dst65_r; 2171cabdff1aSopenharmony_ci dst43_r = dst87_r; 2172cabdff1aSopenharmony_ci dst65_r = dst109_r; 2173cabdff1aSopenharmony_ci dst66 = __lsx_vreplvei_d(dst108, 1); 2174cabdff1aSopenharmony_ci } 2175cabdff1aSopenharmony_ci} 2176cabdff1aSopenharmony_ci 2177cabdff1aSopenharmony_cistatic void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, 2178cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 2179cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 2180cabdff1aSopenharmony_ci int32_t height) 2181cabdff1aSopenharmony_ci{ 2182cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride, 2183cabdff1aSopenharmony_ci filter_x, filter_y, height, 16); 2184cabdff1aSopenharmony_ci} 2185cabdff1aSopenharmony_ci 2186cabdff1aSopenharmony_cistatic void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, 2187cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 2188cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 2189cabdff1aSopenharmony_ci int32_t height) 2190cabdff1aSopenharmony_ci{ 2191cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride, 2192cabdff1aSopenharmony_ci filter_x, filter_y, height, 24); 2193cabdff1aSopenharmony_ci} 2194cabdff1aSopenharmony_ci 2195cabdff1aSopenharmony_cistatic void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, 2196cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 2197cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 2198cabdff1aSopenharmony_ci int32_t height) 2199cabdff1aSopenharmony_ci{ 2200cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride, 2201cabdff1aSopenharmony_ci filter_x, filter_y, height, 32); 2202cabdff1aSopenharmony_ci} 2203cabdff1aSopenharmony_ci 2204cabdff1aSopenharmony_cistatic void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, 2205cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 2206cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 2207cabdff1aSopenharmony_ci int32_t height) 2208cabdff1aSopenharmony_ci{ 2209cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride, 2210cabdff1aSopenharmony_ci filter_x, filter_y, height, 48); 2211cabdff1aSopenharmony_ci} 2212cabdff1aSopenharmony_ci 2213cabdff1aSopenharmony_cistatic void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, 2214cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 2215cabdff1aSopenharmony_ci const int8_t *filter_x, const int8_t *filter_y, 2216cabdff1aSopenharmony_ci int32_t height) 2217cabdff1aSopenharmony_ci{ 2218cabdff1aSopenharmony_ci hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride, 2219cabdff1aSopenharmony_ci filter_x, filter_y, height, 64); 2220cabdff1aSopenharmony_ci} 2221cabdff1aSopenharmony_ci 2222cabdff1aSopenharmony_cistatic void hevc_hz_4t_32w_lsx(uint8_t *src, 2223cabdff1aSopenharmony_ci int32_t src_stride, 2224cabdff1aSopenharmony_ci int16_t *dst, 2225cabdff1aSopenharmony_ci int32_t dst_stride, 2226cabdff1aSopenharmony_ci const int8_t *filter, 2227cabdff1aSopenharmony_ci int32_t height) 2228cabdff1aSopenharmony_ci{ 2229cabdff1aSopenharmony_ci uint32_t loop_cnt; 2230cabdff1aSopenharmony_ci __m128i src0, src1, src2; 2231cabdff1aSopenharmony_ci __m128i filt0, filt1; 2232cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 2233cabdff1aSopenharmony_ci __m128i mask1, mask2, mask3; 2234cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3; 2235cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3; 2236cabdff1aSopenharmony_ci 2237cabdff1aSopenharmony_ci src -= 1; 2238cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 2239cabdff1aSopenharmony_ci 2240cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2); 2241cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask0, 10); 2242cabdff1aSopenharmony_ci 2243cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 2244cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); 2245cabdff1aSopenharmony_ci src2 = __lsx_vld(src, 24); 2246cabdff1aSopenharmony_ci src += src_stride; 2247cabdff1aSopenharmony_ci 2248cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, 2249cabdff1aSopenharmony_ci vec0, vec1); 2250cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src2, src2, mask0, 2251cabdff1aSopenharmony_ci vec2, vec3); 2252cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0, 2253cabdff1aSopenharmony_ci vec3, filt0, dst0, dst1, dst2, dst3); 2254cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, 2255cabdff1aSopenharmony_ci vec0, vec1); 2256cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask1, src2, src2, mask1, 2257cabdff1aSopenharmony_ci vec2, vec3); 2258cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1, 2259cabdff1aSopenharmony_ci dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3); 2260cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 0); 2261cabdff1aSopenharmony_ci __lsx_vst(dst1, dst, 16); 2262cabdff1aSopenharmony_ci __lsx_vst(dst2, dst, 32); 2263cabdff1aSopenharmony_ci __lsx_vst(dst3, dst, 48); 2264cabdff1aSopenharmony_ci dst += dst_stride; 2265cabdff1aSopenharmony_ci } 2266cabdff1aSopenharmony_ci} 2267cabdff1aSopenharmony_ci 2268cabdff1aSopenharmony_cistatic void hevc_vt_4t_16w_lsx(uint8_t *src, 2269cabdff1aSopenharmony_ci int32_t src_stride, 2270cabdff1aSopenharmony_ci int16_t *dst, 2271cabdff1aSopenharmony_ci int32_t dst_stride, 2272cabdff1aSopenharmony_ci const int8_t *filter, 2273cabdff1aSopenharmony_ci int32_t height) 2274cabdff1aSopenharmony_ci{ 2275cabdff1aSopenharmony_ci int32_t loop_cnt; 2276cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 2277cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 2278cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5; 2279cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src21_r, src43_r; 2280cabdff1aSopenharmony_ci __m128i src10_l, src32_l, src21_l, src43_l; 2281cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst0_l, dst1_l; 2282cabdff1aSopenharmony_ci __m128i filt0, filt1; 2283cabdff1aSopenharmony_ci 2284cabdff1aSopenharmony_ci src -= src_stride; 2285cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 2286cabdff1aSopenharmony_ci 2287cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 2288cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 2289cabdff1aSopenharmony_ci src += src_stride_3x; 2290cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 2291cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 2292cabdff1aSopenharmony_ci 2293cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2294cabdff1aSopenharmony_ci src3 = __lsx_vld(src, 0); 2295cabdff1aSopenharmony_ci src4 = __lsx_vldx(src, src_stride); 2296cabdff1aSopenharmony_ci src += src_stride_2x; 2297cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 2298cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 2299cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r, 2300cabdff1aSopenharmony_ci filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 2301cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l, 2302cabdff1aSopenharmony_ci src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l, 2303cabdff1aSopenharmony_ci filt1, dst0_r, dst0_l, dst1_r, dst1_l); 2304cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 2305cabdff1aSopenharmony_ci __lsx_vst(dst0_l, dst, 16); 2306cabdff1aSopenharmony_ci dst += dst_stride; 2307cabdff1aSopenharmony_ci __lsx_vst(dst1_r, dst, 0); 2308cabdff1aSopenharmony_ci __lsx_vst(dst1_l, dst, 16); 2309cabdff1aSopenharmony_ci dst += dst_stride; 2310cabdff1aSopenharmony_ci 2311cabdff1aSopenharmony_ci src5 = __lsx_vld(src, 0); 2312cabdff1aSopenharmony_ci src2 = __lsx_vldx(src, src_stride); 2313cabdff1aSopenharmony_ci src += src_stride_2x; 2314cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r); 2315cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l); 2316cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r, 2317cabdff1aSopenharmony_ci filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 2318cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, 2319cabdff1aSopenharmony_ci src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, 2320cabdff1aSopenharmony_ci filt1, dst0_r, dst0_l, dst1_r, dst1_l); 2321cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 2322cabdff1aSopenharmony_ci __lsx_vst(dst0_l, dst, 16); 2323cabdff1aSopenharmony_ci dst += dst_stride; 2324cabdff1aSopenharmony_ci __lsx_vst(dst1_r, dst, 0); 2325cabdff1aSopenharmony_ci __lsx_vst(dst1_l, dst, 16); 2326cabdff1aSopenharmony_ci dst += dst_stride; 2327cabdff1aSopenharmony_ci } 2328cabdff1aSopenharmony_ci} 2329cabdff1aSopenharmony_ci 2330cabdff1aSopenharmony_cistatic void hevc_vt_4t_24w_lsx(uint8_t *src, 2331cabdff1aSopenharmony_ci int32_t src_stride, 2332cabdff1aSopenharmony_ci int16_t *dst, 2333cabdff1aSopenharmony_ci int32_t dst_stride, 2334cabdff1aSopenharmony_ci const int8_t *filter, 2335cabdff1aSopenharmony_ci int32_t height) 2336cabdff1aSopenharmony_ci{ 2337cabdff1aSopenharmony_ci int32_t loop_cnt; 2338cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 2339cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 2340cabdff1aSopenharmony_ci uint8_t *_src; 2341cabdff1aSopenharmony_ci 2342cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5; 2343cabdff1aSopenharmony_ci __m128i src6, src7, src8, src9, src10, src11; 2344cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src76_r, src98_r; 2345cabdff1aSopenharmony_ci __m128i src21_r, src43_r, src87_r, src109_r; 2346cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst2_r, dst3_r; 2347cabdff1aSopenharmony_ci __m128i src10_l, src32_l, src21_l, src43_l; 2348cabdff1aSopenharmony_ci __m128i dst0_l, dst1_l; 2349cabdff1aSopenharmony_ci __m128i filt0, filt1; 2350cabdff1aSopenharmony_ci 2351cabdff1aSopenharmony_ci src -= src_stride; 2352cabdff1aSopenharmony_ci _src = src + 16; 2353cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 2354cabdff1aSopenharmony_ci 2355cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 2356cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 2357cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 2358cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 2359cabdff1aSopenharmony_ci 2360cabdff1aSopenharmony_ci src6 = __lsx_vld(_src, 0); 2361cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8); 2362cabdff1aSopenharmony_ci src += src_stride_3x; 2363cabdff1aSopenharmony_ci _src += src_stride_3x; 2364cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r); 2365cabdff1aSopenharmony_ci 2366cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2367cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9); 2368cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10); 2369cabdff1aSopenharmony_ci src += src_stride_2x; 2370cabdff1aSopenharmony_ci _src += src_stride_2x; 2371cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 2372cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 2373cabdff1aSopenharmony_ci 2374cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r); 2375cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r, 2376cabdff1aSopenharmony_ci filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 2377cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l, 2378cabdff1aSopenharmony_ci src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l, 2379cabdff1aSopenharmony_ci filt1, dst0_r, dst0_l, dst1_r, dst1_l); 2380cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0, 2381cabdff1aSopenharmony_ci dst2_r, dst3_r); 2382cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r, 2383cabdff1aSopenharmony_ci src109_r, filt1, dst2_r, dst3_r); 2384cabdff1aSopenharmony_ci 2385cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 2386cabdff1aSopenharmony_ci __lsx_vst(dst0_l, dst, 16); 2387cabdff1aSopenharmony_ci __lsx_vst(dst2_r, dst, 32); 2388cabdff1aSopenharmony_ci dst += dst_stride; 2389cabdff1aSopenharmony_ci __lsx_vst(dst1_r, dst, 0); 2390cabdff1aSopenharmony_ci __lsx_vst(dst1_l, dst, 16); 2391cabdff1aSopenharmony_ci __lsx_vst(dst3_r, dst, 32); 2392cabdff1aSopenharmony_ci dst += dst_stride; 2393cabdff1aSopenharmony_ci 2394cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11); 2395cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8); 2396cabdff1aSopenharmony_ci src += src_stride_2x; 2397cabdff1aSopenharmony_ci _src += src_stride_2x; 2398cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r); 2399cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l); 2400cabdff1aSopenharmony_ci 2401cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r); 2402cabdff1aSopenharmony_ci 2403cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r, 2404cabdff1aSopenharmony_ci filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 2405cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, src10_l, 2406cabdff1aSopenharmony_ci filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, filt1, 2407cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 2408cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0, 2409cabdff1aSopenharmony_ci dst2_r, dst3_r); 2410cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r, src87_r, 2411cabdff1aSopenharmony_ci filt1, dst2_r, dst3_r); 2412cabdff1aSopenharmony_ci 2413cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 2414cabdff1aSopenharmony_ci __lsx_vst(dst0_l, dst, 16); 2415cabdff1aSopenharmony_ci __lsx_vst(dst2_r, dst, 32); 2416cabdff1aSopenharmony_ci dst += dst_stride; 2417cabdff1aSopenharmony_ci __lsx_vst(dst1_r, dst, 0); 2418cabdff1aSopenharmony_ci __lsx_vst(dst1_l, dst, 16); 2419cabdff1aSopenharmony_ci __lsx_vst(dst3_r, dst, 32); 2420cabdff1aSopenharmony_ci dst += dst_stride; 2421cabdff1aSopenharmony_ci } 2422cabdff1aSopenharmony_ci} 2423cabdff1aSopenharmony_ci 2424cabdff1aSopenharmony_cistatic void hevc_vt_4t_32w_lsx(uint8_t *src, 2425cabdff1aSopenharmony_ci int32_t src_stride, 2426cabdff1aSopenharmony_ci int16_t *dst, 2427cabdff1aSopenharmony_ci int32_t dst_stride, 2428cabdff1aSopenharmony_ci const int8_t *filter, 2429cabdff1aSopenharmony_ci int32_t height) 2430cabdff1aSopenharmony_ci{ 2431cabdff1aSopenharmony_ci int32_t loop_cnt; 2432cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 2433cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 2434cabdff1aSopenharmony_ci uint8_t *_src; 2435cabdff1aSopenharmony_ci 2436cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5; 2437cabdff1aSopenharmony_ci __m128i src6, src7, src8, src9, src10, src11; 2438cabdff1aSopenharmony_ci __m128i src10_r, src32_r, src76_r, src98_r; 2439cabdff1aSopenharmony_ci __m128i src21_r, src43_r, src87_r, src109_r; 2440cabdff1aSopenharmony_ci __m128i dst0_r, dst1_r, dst2_r, dst3_r; 2441cabdff1aSopenharmony_ci __m128i src10_l, src32_l, src76_l, src98_l; 2442cabdff1aSopenharmony_ci __m128i src21_l, src43_l, src87_l, src109_l; 2443cabdff1aSopenharmony_ci __m128i dst0_l, dst1_l, dst2_l, dst3_l; 2444cabdff1aSopenharmony_ci __m128i filt0, filt1; 2445cabdff1aSopenharmony_ci 2446cabdff1aSopenharmony_ci src -= src_stride; 2447cabdff1aSopenharmony_ci _src = src + 16; 2448cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 2449cabdff1aSopenharmony_ci 2450cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 2451cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 2452cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 2453cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 2454cabdff1aSopenharmony_ci 2455cabdff1aSopenharmony_ci src6 = __lsx_vld(_src, 0); 2456cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8); 2457cabdff1aSopenharmony_ci src += src_stride_3x; 2458cabdff1aSopenharmony_ci _src += src_stride_3x; 2459cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r); 2460cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l); 2461cabdff1aSopenharmony_ci 2462cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2463cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9); 2464cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10); 2465cabdff1aSopenharmony_ci src += src_stride_2x; 2466cabdff1aSopenharmony_ci _src += src_stride_2x; 2467cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 2468cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 2469cabdff1aSopenharmony_ci 2470cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r); 2471cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l); 2472cabdff1aSopenharmony_ci 2473cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r, 2474cabdff1aSopenharmony_ci filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 2475cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l, 2476cabdff1aSopenharmony_ci src32_l, filt1, dst1_r, src43_r, filt1, dst1_l,src43_l, 2477cabdff1aSopenharmony_ci filt1, dst0_r, dst0_l, dst1_r, dst1_l); 2478cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r, 2479cabdff1aSopenharmony_ci filt0, src87_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l); 2480cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst2_l, src98_l, 2481cabdff1aSopenharmony_ci filt1, dst3_r, src109_r, filt1, dst3_l, src109_l, filt1, 2482cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 2483cabdff1aSopenharmony_ci 2484cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 2485cabdff1aSopenharmony_ci __lsx_vst(dst0_l, dst, 16); 2486cabdff1aSopenharmony_ci __lsx_vst(dst2_r, dst, 32); 2487cabdff1aSopenharmony_ci __lsx_vst(dst2_l, dst, 48); 2488cabdff1aSopenharmony_ci dst += dst_stride; 2489cabdff1aSopenharmony_ci __lsx_vst(dst1_r, dst, 0); 2490cabdff1aSopenharmony_ci __lsx_vst(dst1_l, dst, 16); 2491cabdff1aSopenharmony_ci __lsx_vst(dst3_r, dst, 32); 2492cabdff1aSopenharmony_ci __lsx_vst(dst3_l, dst, 48); 2493cabdff1aSopenharmony_ci dst += dst_stride; 2494cabdff1aSopenharmony_ci 2495cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11); 2496cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8); 2497cabdff1aSopenharmony_ci src += src_stride_2x; 2498cabdff1aSopenharmony_ci _src += src_stride_2x; 2499cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r); 2500cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l); 2501cabdff1aSopenharmony_ci 2502cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r); 2503cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, src11, src10, src8, src11, src76_l, src87_l); 2504cabdff1aSopenharmony_ci 2505cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r, 2506cabdff1aSopenharmony_ci filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 2507cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, 2508cabdff1aSopenharmony_ci src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, 2509cabdff1aSopenharmony_ci filt1, dst0_r, dst0_l, dst1_r, dst1_l); 2510cabdff1aSopenharmony_ci 2511cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src98_l, filt0, src109_r, 2512cabdff1aSopenharmony_ci filt0, src109_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l); 2513cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst2_l, src76_l, 2514cabdff1aSopenharmony_ci filt1, dst3_r, src87_r, filt1, dst3_l, src87_l, filt1, 2515cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 2516cabdff1aSopenharmony_ci 2517cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 2518cabdff1aSopenharmony_ci __lsx_vst(dst0_l, dst, 16); 2519cabdff1aSopenharmony_ci __lsx_vst(dst2_r, dst, 32); 2520cabdff1aSopenharmony_ci __lsx_vst(dst2_l, dst, 48); 2521cabdff1aSopenharmony_ci dst += dst_stride; 2522cabdff1aSopenharmony_ci __lsx_vst(dst1_r, dst, 0); 2523cabdff1aSopenharmony_ci __lsx_vst(dst1_l, dst, 16); 2524cabdff1aSopenharmony_ci __lsx_vst(dst3_r, dst, 32); 2525cabdff1aSopenharmony_ci __lsx_vst(dst3_l, dst, 48); 2526cabdff1aSopenharmony_ci dst += dst_stride; 2527cabdff1aSopenharmony_ci } 2528cabdff1aSopenharmony_ci} 2529cabdff1aSopenharmony_ci 2530cabdff1aSopenharmony_cistatic void hevc_hv_4t_8x2_lsx(uint8_t *src, 2531cabdff1aSopenharmony_ci int32_t src_stride, 2532cabdff1aSopenharmony_ci int16_t *dst, 2533cabdff1aSopenharmony_ci int32_t dst_stride, 2534cabdff1aSopenharmony_ci const int8_t *filter_x, 2535cabdff1aSopenharmony_ci const int8_t *filter_y) 2536cabdff1aSopenharmony_ci{ 2537cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 2538cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 2539cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 2540cabdff1aSopenharmony_ci 2541cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4; 2542cabdff1aSopenharmony_ci __m128i filt0, filt1; 2543cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1; 2544cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 2545cabdff1aSopenharmony_ci __m128i mask1; 2546cabdff1aSopenharmony_ci __m128i filter_vec; 2547cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 2548cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4; 2549cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l; 2550cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst21_r, dst43_r; 2551cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst21_l, dst43_l; 2552cabdff1aSopenharmony_ci 2553cabdff1aSopenharmony_ci src -= (src_stride + 1); 2554cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 2555cabdff1aSopenharmony_ci 2556cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 2557cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 2558cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 2559cabdff1aSopenharmony_ci 2560cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 2561cabdff1aSopenharmony_ci 2562cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 2563cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 2564cabdff1aSopenharmony_ci src3 = __lsx_vldx(src, src_stride_3x); 2565cabdff1aSopenharmony_ci src4 = __lsx_vldx(src, src_stride_4x); 2566cabdff1aSopenharmony_ci 2567cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1); 2568cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3); 2569cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5); 2570cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7); 2571cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9); 2572cabdff1aSopenharmony_ci 2573cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, 2574cabdff1aSopenharmony_ci vec6, filt0, dst0, dst1, dst2, dst3); 2575cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, 2576cabdff1aSopenharmony_ci dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3); 2577cabdff1aSopenharmony_ci dst4 = __lsx_vdp2_h_bu_b(vec8, filt0); 2578cabdff1aSopenharmony_ci dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1); 2579cabdff1aSopenharmony_ci 2580cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r); 2581cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l); 2582cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r); 2583cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l); 2584cabdff1aSopenharmony_ci 2585cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 2586cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 2587cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l, 2588cabdff1aSopenharmony_ci filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1, 2589cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 2590cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, 2591cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 2592cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 2593cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 2594cabdff1aSopenharmony_ci __lsx_vst(dst1_r, dst + dst_stride, 0); 2595cabdff1aSopenharmony_ci} 2596cabdff1aSopenharmony_ci 2597cabdff1aSopenharmony_cistatic void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride, 2598cabdff1aSopenharmony_ci int16_t *dst, int32_t dst_stride, 2599cabdff1aSopenharmony_ci const int8_t *filter_x, 2600cabdff1aSopenharmony_ci const int8_t *filter_y, int32_t width8mult) 2601cabdff1aSopenharmony_ci{ 2602cabdff1aSopenharmony_ci int32_t cnt; 2603cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 2604cabdff1aSopenharmony_ci int32_t dst_stride_x = (dst_stride << 1); 2605cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 2606cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 2); 2607cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 2608cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride_x; 2609cabdff1aSopenharmony_ci 2610cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1; 2611cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2612cabdff1aSopenharmony_ci __m128i filt0, filt1, filt_h0, filt_h1, filter_vec; 2613cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6; 2614cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 2615cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 2616cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 2617cabdff1aSopenharmony_ci 2618cabdff1aSopenharmony_ci src -= (src_stride + 1); 2619cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 2620cabdff1aSopenharmony_ci 2621cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 2622cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 2623cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 2624cabdff1aSopenharmony_ci 2625cabdff1aSopenharmony_ci mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 2626cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 2627cabdff1aSopenharmony_ci 2628cabdff1aSopenharmony_ci for (cnt = width8mult; cnt--;) { 2629cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 2630cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src, 2631cabdff1aSopenharmony_ci src_stride_3x, src, src_stride_4x, src1, src2, src3, src4); 2632cabdff1aSopenharmony_ci src += src_stride_4x; 2633cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6); 2634cabdff1aSopenharmony_ci src += (8 - src_stride_4x); 2635cabdff1aSopenharmony_ci 2636cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, 2637cabdff1aSopenharmony_ci vec0, vec1); 2638cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, 2639cabdff1aSopenharmony_ci vec2, vec3); 2640cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, 2641cabdff1aSopenharmony_ci vec4, vec5); 2642cabdff1aSopenharmony_ci 2643cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1); 2644cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, 2645cabdff1aSopenharmony_ci dst0, dst1); 2646cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec4, filt0); 2647cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1); 2648cabdff1aSopenharmony_ci 2649cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r); 2650cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l); 2651cabdff1aSopenharmony_ci 2652cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, 2653cabdff1aSopenharmony_ci vec0, vec1); 2654cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, 2655cabdff1aSopenharmony_ci vec2, vec3); 2656cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, 2657cabdff1aSopenharmony_ci vec4, vec5); 2658cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, 2659cabdff1aSopenharmony_ci vec6, vec7); 2660cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, 2661cabdff1aSopenharmony_ci vec6, filt0, dst3, dst4, dst5, dst6); 2662cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1, 2663cabdff1aSopenharmony_ci dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6); 2664cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r); 2665cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l); 2666cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r); 2667cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l); 2668cabdff1aSopenharmony_ci 2669cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 2670cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 2671cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l, 2672cabdff1aSopenharmony_ci filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1, 2673cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 2674cabdff1aSopenharmony_ci 2675cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r, 2676cabdff1aSopenharmony_ci filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l); 2677cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l, 2678cabdff1aSopenharmony_ci filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1, 2679cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 2680cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, 2681cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 2682cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6, 2683cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 2684cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, 2685cabdff1aSopenharmony_ci dst0_r, dst1_r); 2686cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, dst2_l, dst2_r, dst3_l, dst3_r, 2687cabdff1aSopenharmony_ci dst2_r, dst3_r); 2688cabdff1aSopenharmony_ci 2689cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 2690cabdff1aSopenharmony_ci __lsx_vstx(dst1_r, dst, dst_stride_x); 2691cabdff1aSopenharmony_ci __lsx_vstx(dst2_r, dst, dst_stride_2x); 2692cabdff1aSopenharmony_ci __lsx_vstx(dst3_r, dst, dst_stride_3x); 2693cabdff1aSopenharmony_ci dst += 8; 2694cabdff1aSopenharmony_ci } 2695cabdff1aSopenharmony_ci} 2696cabdff1aSopenharmony_ci 2697cabdff1aSopenharmony_cistatic void hevc_hv_4t_8x6_lsx(uint8_t *src, 2698cabdff1aSopenharmony_ci int32_t src_stride, 2699cabdff1aSopenharmony_ci int16_t *dst, 2700cabdff1aSopenharmony_ci int32_t dst_stride, 2701cabdff1aSopenharmony_ci const int8_t *filter_x, 2702cabdff1aSopenharmony_ci const int8_t *filter_y) 2703cabdff1aSopenharmony_ci{ 2704cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 2705cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 1); 2706cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 2707cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 2708cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; 2709cabdff1aSopenharmony_ci __m128i filt0, filt1; 2710cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1; 2711cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 2712cabdff1aSopenharmony_ci __m128i mask1, filter_vec; 2713cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 2714cabdff1aSopenharmony_ci __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; 2715cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 2716cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 2717cabdff1aSopenharmony_ci __m128i dst4_r, dst4_l, dst5_r, dst5_l; 2718cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst10_l, dst32_l; 2719cabdff1aSopenharmony_ci __m128i dst21_r, dst43_r, dst21_l, dst43_l; 2720cabdff1aSopenharmony_ci __m128i dst54_r, dst54_l, dst65_r, dst65_l; 2721cabdff1aSopenharmony_ci __m128i dst76_r, dst76_l, dst87_r, dst87_l; 2722cabdff1aSopenharmony_ci 2723cabdff1aSopenharmony_ci src -= (src_stride + 1); 2724cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 2725cabdff1aSopenharmony_ci 2726cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 2727cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 2728cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 2729cabdff1aSopenharmony_ci 2730cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 2731cabdff1aSopenharmony_ci 2732cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 2733cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src, 2734cabdff1aSopenharmony_ci src_stride_3x, src, src_stride_4x, src1, src2, src3, src4); 2735cabdff1aSopenharmony_ci src += src_stride_4x; 2736cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src, 2737cabdff1aSopenharmony_ci src_stride_3x, src, src_stride_4x, src5, src6, src7, src8); 2738cabdff1aSopenharmony_ci 2739cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1, 2740cabdff1aSopenharmony_ci mask0, src1, src1, mask1, vec0, vec1, vec2, vec3); 2741cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,src3, src3, 2742cabdff1aSopenharmony_ci mask0, src3, src3, mask1, vec4, vec5, vec6, vec7); 2743cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5, 2744cabdff1aSopenharmony_ci mask0, src5, src5, mask1, vec8, vec9, vec10, vec11); 2745cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7, 2746cabdff1aSopenharmony_ci mask0, src7, src7, mask1, vec12, vec13, vec14, vec15); 2747cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, 2748cabdff1aSopenharmony_ci vec16, vec17); 2749cabdff1aSopenharmony_ci 2750cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6, 2751cabdff1aSopenharmony_ci filt0, dst0, dst1, dst2, dst3); 2752cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, 2753cabdff1aSopenharmony_ci dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3); 2754cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, 2755cabdff1aSopenharmony_ci vec14, filt0, dst4, dst5, dst6, dst7); 2756cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6, 2757cabdff1aSopenharmony_ci vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7); 2758cabdff1aSopenharmony_ci dst8 = __lsx_vdp2_h_bu_b(vec16, filt0); 2759cabdff1aSopenharmony_ci dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1); 2760cabdff1aSopenharmony_ci 2761cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3, 2762cabdff1aSopenharmony_ci dst10_r, dst21_r, dst32_r, dst43_r); 2763cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3, 2764cabdff1aSopenharmony_ci dst10_l, dst21_l, dst32_l, dst43_l); 2765cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7, 2766cabdff1aSopenharmony_ci dst54_r, dst65_r, dst76_r, dst87_r); 2767cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7, 2768cabdff1aSopenharmony_ci dst54_l, dst65_l, dst76_l, dst87_l); 2769cabdff1aSopenharmony_ci 2770cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 2771cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 2772cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r, 2773cabdff1aSopenharmony_ci filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l); 2774cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r, 2775cabdff1aSopenharmony_ci filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l); 2776cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l, 2777cabdff1aSopenharmony_ci filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1, 2778cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 2779cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l, 2780cabdff1aSopenharmony_ci filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1, 2781cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 2782cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l, 2783cabdff1aSopenharmony_ci filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1, 2784cabdff1aSopenharmony_ci dst4_r, dst4_l, dst5_r, dst5_l); 2785cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, dst0_r, 2786cabdff1aSopenharmony_ci dst0_l, dst1_r, dst1_l); 2787cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6, dst2_r, 2788cabdff1aSopenharmony_ci dst2_l, dst3_r, dst3_l); 2789cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6, dst4_r, 2790cabdff1aSopenharmony_ci dst4_l, dst5_r, dst5_l); 2791cabdff1aSopenharmony_ci 2792cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpickev_h,dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, 2793cabdff1aSopenharmony_ci dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r); 2794cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r); 2795cabdff1aSopenharmony_ci 2796cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst, 0); 2797cabdff1aSopenharmony_ci __lsx_vstx(dst1_r, dst, dst_stride_2x); 2798cabdff1aSopenharmony_ci dst += dst_stride_2x; 2799cabdff1aSopenharmony_ci __lsx_vst(dst2_r, dst, 0); 2800cabdff1aSopenharmony_ci __lsx_vstx(dst3_r, dst, dst_stride_2x); 2801cabdff1aSopenharmony_ci dst += dst_stride_2x; 2802cabdff1aSopenharmony_ci __lsx_vst(dst4_r, dst, 0); 2803cabdff1aSopenharmony_ci __lsx_vstx(dst5_r, dst, dst_stride_2x); 2804cabdff1aSopenharmony_ci} 2805cabdff1aSopenharmony_ci 2806cabdff1aSopenharmony_cistatic void hevc_hv_4t_8multx4mult_lsx(uint8_t *src, 2807cabdff1aSopenharmony_ci int32_t src_stride, 2808cabdff1aSopenharmony_ci int16_t *dst, 2809cabdff1aSopenharmony_ci int32_t dst_stride, 2810cabdff1aSopenharmony_ci const int8_t *filter_x, 2811cabdff1aSopenharmony_ci const int8_t *filter_y, 2812cabdff1aSopenharmony_ci int32_t height, 2813cabdff1aSopenharmony_ci int32_t width8mult) 2814cabdff1aSopenharmony_ci{ 2815cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 2816cabdff1aSopenharmony_ci uint8_t *src_tmp; 2817cabdff1aSopenharmony_ci int16_t *dst_tmp; 2818cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 2819cabdff1aSopenharmony_ci int32_t dst_stride_x = (dst_stride << 1); 2820cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 2821cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 2); 2822cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 2823cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride_x; 2824cabdff1aSopenharmony_ci 2825cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6; 2826cabdff1aSopenharmony_ci __m128i filt0, filt1; 2827cabdff1aSopenharmony_ci __m128i filt_h0, filt_h1; 2828cabdff1aSopenharmony_ci __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 2829cabdff1aSopenharmony_ci __m128i mask1, filter_vec; 2830cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2831cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6; 2832cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 2833cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 2834cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 2835cabdff1aSopenharmony_ci 2836cabdff1aSopenharmony_ci src -= (src_stride + 1); 2837cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 2838cabdff1aSopenharmony_ci 2839cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 2840cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 2841cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 2842cabdff1aSopenharmony_ci 2843cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 2844cabdff1aSopenharmony_ci 2845cabdff1aSopenharmony_ci for (cnt = width8mult; cnt--;) { 2846cabdff1aSopenharmony_ci src_tmp = src; 2847cabdff1aSopenharmony_ci dst_tmp = dst; 2848cabdff1aSopenharmony_ci 2849cabdff1aSopenharmony_ci src0 = __lsx_vld(src_tmp, 0); 2850cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 2851cabdff1aSopenharmony_ci src1, src2); 2852cabdff1aSopenharmony_ci src_tmp += src_stride_3x; 2853cabdff1aSopenharmony_ci 2854cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, 2855cabdff1aSopenharmony_ci vec0, vec1); 2856cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, 2857cabdff1aSopenharmony_ci vec2, vec3); 2858cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, 2859cabdff1aSopenharmony_ci vec4, vec5); 2860cabdff1aSopenharmony_ci 2861cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1); 2862cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, 2863cabdff1aSopenharmony_ci dst0, dst1); 2864cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec4, filt0); 2865cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1); 2866cabdff1aSopenharmony_ci 2867cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r); 2868cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l); 2869cabdff1aSopenharmony_ci 2870cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 2871cabdff1aSopenharmony_ci src3 = __lsx_vld(src_tmp, 0); 2872cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 2873cabdff1aSopenharmony_ci src4, src5); 2874cabdff1aSopenharmony_ci src6 = __lsx_vldx(src_tmp, src_stride_3x); 2875cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 2876cabdff1aSopenharmony_ci 2877cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, 2878cabdff1aSopenharmony_ci vec0, vec1); 2879cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, 2880cabdff1aSopenharmony_ci vec2, vec3); 2881cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, 2882cabdff1aSopenharmony_ci vec4, vec5); 2883cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, 2884cabdff1aSopenharmony_ci vec6, vec7); 2885cabdff1aSopenharmony_ci 2886cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, 2887cabdff1aSopenharmony_ci vec6, filt0, dst3, dst4, dst5, dst6); 2888cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, 2889cabdff1aSopenharmony_ci filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3, 2890cabdff1aSopenharmony_ci dst4, dst5, dst6); 2891cabdff1aSopenharmony_ci 2892cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r); 2893cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l); 2894cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r); 2895cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l); 2896cabdff1aSopenharmony_ci 2897cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 2898cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 2899cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r, 2900cabdff1aSopenharmony_ci filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l); 2901cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, 2902cabdff1aSopenharmony_ci dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, 2903cabdff1aSopenharmony_ci dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l); 2904cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, 2905cabdff1aSopenharmony_ci dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, 2906cabdff1aSopenharmony_ci dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l); 2907cabdff1aSopenharmony_ci 2908cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, 2909cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 2910cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6, 2911cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 2912cabdff1aSopenharmony_ci 2913cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, 2914cabdff1aSopenharmony_ci dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r); 2915cabdff1aSopenharmony_ci 2916cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst_tmp, 0); 2917cabdff1aSopenharmony_ci __lsx_vstx(dst1_r, dst_tmp, dst_stride_x); 2918cabdff1aSopenharmony_ci __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x); 2919cabdff1aSopenharmony_ci __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x); 2920cabdff1aSopenharmony_ci dst_tmp += dst_stride_2x; 2921cabdff1aSopenharmony_ci 2922cabdff1aSopenharmony_ci dst10_r = dst54_r; 2923cabdff1aSopenharmony_ci dst10_l = dst54_l; 2924cabdff1aSopenharmony_ci dst21_r = dst65_r; 2925cabdff1aSopenharmony_ci dst21_l = dst65_l; 2926cabdff1aSopenharmony_ci dst2 = dst6; 2927cabdff1aSopenharmony_ci } 2928cabdff1aSopenharmony_ci src += 8; 2929cabdff1aSopenharmony_ci dst += 8; 2930cabdff1aSopenharmony_ci } 2931cabdff1aSopenharmony_ci} 2932cabdff1aSopenharmony_ci 2933cabdff1aSopenharmony_cistatic void hevc_hv_4t_8w_lsx(uint8_t *src, 2934cabdff1aSopenharmony_ci int32_t src_stride, 2935cabdff1aSopenharmony_ci int16_t *dst, 2936cabdff1aSopenharmony_ci int32_t dst_stride, 2937cabdff1aSopenharmony_ci const int8_t *filter_x, 2938cabdff1aSopenharmony_ci const int8_t *filter_y, 2939cabdff1aSopenharmony_ci int32_t height) 2940cabdff1aSopenharmony_ci{ 2941cabdff1aSopenharmony_ci 2942cabdff1aSopenharmony_ci if (2 == height) { 2943cabdff1aSopenharmony_ci hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride, 2944cabdff1aSopenharmony_ci filter_x, filter_y); 2945cabdff1aSopenharmony_ci } else if (4 == height) { 2946cabdff1aSopenharmony_ci hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride, 2947cabdff1aSopenharmony_ci filter_x, filter_y, 1); 2948cabdff1aSopenharmony_ci } else if (6 == height) { 2949cabdff1aSopenharmony_ci hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride, 2950cabdff1aSopenharmony_ci filter_x, filter_y); 2951cabdff1aSopenharmony_ci } else if (0 == (height & 0x03)) { 2952cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride, 2953cabdff1aSopenharmony_ci filter_x, filter_y, height, 1); 2954cabdff1aSopenharmony_ci } 2955cabdff1aSopenharmony_ci} 2956cabdff1aSopenharmony_ci 2957cabdff1aSopenharmony_cistatic void hevc_hv_4t_12w_lsx(uint8_t *src, 2958cabdff1aSopenharmony_ci int32_t src_stride, 2959cabdff1aSopenharmony_ci int16_t *dst, 2960cabdff1aSopenharmony_ci int32_t dst_stride, 2961cabdff1aSopenharmony_ci const int8_t *filter_x, 2962cabdff1aSopenharmony_ci const int8_t *filter_y, 2963cabdff1aSopenharmony_ci int32_t height) 2964cabdff1aSopenharmony_ci{ 2965cabdff1aSopenharmony_ci uint32_t loop_cnt; 2966cabdff1aSopenharmony_ci uint8_t *src_tmp; 2967cabdff1aSopenharmony_ci int16_t *dst_tmp; 2968cabdff1aSopenharmony_ci int32_t src_stride_2x = (src_stride << 1); 2969cabdff1aSopenharmony_ci int32_t dst_stride_x = (dst_stride << 1); 2970cabdff1aSopenharmony_ci int32_t src_stride_4x = (src_stride << 2); 2971cabdff1aSopenharmony_ci int32_t dst_stride_2x = (dst_stride << 2); 2972cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 2973cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride_x; 2974cabdff1aSopenharmony_ci 2975cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2976cabdff1aSopenharmony_ci __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2977cabdff1aSopenharmony_ci __m128i mask0, mask1, mask2, mask3; 2978cabdff1aSopenharmony_ci __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, dst0; 2979cabdff1aSopenharmony_ci __m128i dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73; 2980cabdff1aSopenharmony_ci __m128i dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r; 2981cabdff1aSopenharmony_ci __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 2982cabdff1aSopenharmony_ci __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 2983cabdff1aSopenharmony_ci __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 2984cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 2985cabdff1aSopenharmony_ci 2986cabdff1aSopenharmony_ci src -= (src_stride + 1); 2987cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1); 2988cabdff1aSopenharmony_ci 2989cabdff1aSopenharmony_ci filter_vec = __lsx_vld(filter_y, 0); 2990cabdff1aSopenharmony_ci filter_vec = __lsx_vsllwil_h_b(filter_vec, 0); 2991cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1); 2992cabdff1aSopenharmony_ci 2993cabdff1aSopenharmony_ci mask0 = __lsx_vld(ff_hevc_mask_arr, 0); 2994cabdff1aSopenharmony_ci mask1 = __lsx_vaddi_bu(mask0, 2); 2995cabdff1aSopenharmony_ci 2996cabdff1aSopenharmony_ci src_tmp = src; 2997cabdff1aSopenharmony_ci dst_tmp = dst; 2998cabdff1aSopenharmony_ci 2999cabdff1aSopenharmony_ci src0 = __lsx_vld(src_tmp, 0); 3000cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 3001cabdff1aSopenharmony_ci src1, src2); 3002cabdff1aSopenharmony_ci src_tmp += src_stride_3x; 3003cabdff1aSopenharmony_ci 3004cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1); 3005cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3); 3006cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5); 3007cabdff1aSopenharmony_ci 3008cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1); 3009cabdff1aSopenharmony_ci dst2 = __lsx_vdp2_h_bu_b(vec4, filt0); 3010cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, 3011cabdff1aSopenharmony_ci dst0, dst1); 3012cabdff1aSopenharmony_ci dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1); 3013cabdff1aSopenharmony_ci 3014cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r); 3015cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l); 3016cabdff1aSopenharmony_ci 3017cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 3018cabdff1aSopenharmony_ci src3 = __lsx_vld(src_tmp, 0); 3019cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x, 3020cabdff1aSopenharmony_ci src4, src5); 3021cabdff1aSopenharmony_ci src6 = __lsx_vldx(src_tmp, src_stride_3x); 3022cabdff1aSopenharmony_ci src_tmp += src_stride_4x; 3023cabdff1aSopenharmony_ci 3024cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, 3025cabdff1aSopenharmony_ci vec0, vec1); 3026cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, 3027cabdff1aSopenharmony_ci vec2, vec3); 3028cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, 3029cabdff1aSopenharmony_ci vec4, vec5); 3030cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, 3031cabdff1aSopenharmony_ci vec6, vec7); 3032cabdff1aSopenharmony_ci 3033cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, 3034cabdff1aSopenharmony_ci vec6, filt0, dst3, dst4, dst5, dst6); 3035cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, 3036cabdff1aSopenharmony_ci filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3, 3037cabdff1aSopenharmony_ci dst4, dst5, dst6); 3038cabdff1aSopenharmony_ci 3039cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r); 3040cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l); 3041cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r); 3042cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l); 3043cabdff1aSopenharmony_ci 3044cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r, 3045cabdff1aSopenharmony_ci filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l); 3046cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r, 3047cabdff1aSopenharmony_ci filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l); 3048cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l, 3049cabdff1aSopenharmony_ci filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1, 3050cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 3051cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l, 3052cabdff1aSopenharmony_ci filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1, 3053cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 3054cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, 3055cabdff1aSopenharmony_ci dst0_r, dst0_l, dst1_r, dst1_l); 3056cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6, 3057cabdff1aSopenharmony_ci dst2_r, dst2_l, dst3_r, dst3_l); 3058cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, 3059cabdff1aSopenharmony_ci dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r); 3060cabdff1aSopenharmony_ci __lsx_vst(dst0_r, dst_tmp, 0); 3061cabdff1aSopenharmony_ci __lsx_vstx(dst1_r, dst_tmp, dst_stride_x); 3062cabdff1aSopenharmony_ci __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x); 3063cabdff1aSopenharmony_ci __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x); 3064cabdff1aSopenharmony_ci dst_tmp += dst_stride_2x; 3065cabdff1aSopenharmony_ci 3066cabdff1aSopenharmony_ci dst10_r = dst54_r; 3067cabdff1aSopenharmony_ci dst10_l = dst54_l; 3068cabdff1aSopenharmony_ci dst21_r = dst65_r; 3069cabdff1aSopenharmony_ci dst21_l = dst65_l; 3070cabdff1aSopenharmony_ci dst2 = dst6; 3071cabdff1aSopenharmony_ci } 3072cabdff1aSopenharmony_ci 3073cabdff1aSopenharmony_ci src += 8; 3074cabdff1aSopenharmony_ci dst += 8; 3075cabdff1aSopenharmony_ci 3076cabdff1aSopenharmony_ci mask2 = __lsx_vld(ff_hevc_mask_arr, 16); 3077cabdff1aSopenharmony_ci mask3 = __lsx_vaddi_bu(mask2, 2); 3078cabdff1aSopenharmony_ci 3079cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 3080cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2); 3081cabdff1aSopenharmony_ci src += src_stride_3x; 3082cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1); 3083cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3); 3084cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21); 3085cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1, 3086cabdff1aSopenharmony_ci dst10, dst21); 3087cabdff1aSopenharmony_ci dst10_r = __lsx_vilvl_h(dst21, dst10); 3088cabdff1aSopenharmony_ci dst21_r = __lsx_vilvh_h(dst21, dst10); 3089cabdff1aSopenharmony_ci dst22 = __lsx_vreplvei_d(dst21, 1); 3090cabdff1aSopenharmony_ci 3091cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 3092cabdff1aSopenharmony_ci src3 = __lsx_vld(src, 0); 3093cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5); 3094cabdff1aSopenharmony_ci src6 = __lsx_vldx(src, src_stride_3x); 3095cabdff1aSopenharmony_ci src += src_stride_4x; 3096cabdff1aSopenharmony_ci src7 = __lsx_vld(src, 0); 3097cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9); 3098cabdff1aSopenharmony_ci src10 = __lsx_vldx(src, src_stride_3x); 3099cabdff1aSopenharmony_ci src += src_stride_4x; 3100cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3, 3101cabdff1aSopenharmony_ci vec0, vec1); 3102cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src8, src4, mask2, src8, src4, mask3, 3103cabdff1aSopenharmony_ci vec2, vec3); 3104cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3, 3105cabdff1aSopenharmony_ci vec4, vec5); 3106cabdff1aSopenharmony_ci DUP2_ARG3(__lsx_vshuf_b, src10, src6, mask2, src10, src6, mask3, 3107cabdff1aSopenharmony_ci vec6, vec7); 3108cabdff1aSopenharmony_ci 3109cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, 3110cabdff1aSopenharmony_ci vec6, filt0, dst73, dst84, dst95, dst106); 3111cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3, 3112cabdff1aSopenharmony_ci filt1, dst95, vec5, filt1, dst106, vec7, filt1, dst73, 3113cabdff1aSopenharmony_ci dst84, dst95, dst106); 3114cabdff1aSopenharmony_ci 3115cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst73, dst22, dst84, dst73, dst32_r, dst43_r); 3116cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r); 3117cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_h, dst95, dst84, dst106, dst95, dst54_r, dst65_r); 3118cabdff1aSopenharmony_ci dst109_r = __lsx_vilvh_h(dst106, dst95); 3119cabdff1aSopenharmony_ci dst22 = __lsx_vreplvei_d(dst73, 1); 3120cabdff1aSopenharmony_ci dst76_r = __lsx_vilvl_h(dst22, dst106); 3121cabdff1aSopenharmony_ci 3122cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r, 3123cabdff1aSopenharmony_ci filt_h0, dst43_r, filt_h0, tmp0, tmp1, tmp2, tmp3); 3124cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r, 3125cabdff1aSopenharmony_ci filt_h0, dst87_r, filt_h0, tmp4, tmp5, tmp6, tmp7); 3126cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, tmp0, dst32_r, filt_h1, tmp1, dst43_r, 3127cabdff1aSopenharmony_ci filt_h1, tmp2, dst54_r, filt_h1, tmp3, dst65_r, filt_h1, 3128cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 3129cabdff1aSopenharmony_ci DUP4_ARG3(__lsx_vdp2add_w_h, tmp4, dst76_r, filt_h1, tmp5, dst87_r, 3130cabdff1aSopenharmony_ci filt_h1, tmp6, dst98_r, filt_h1, tmp7, dst109_r, filt_h1, 3131cabdff1aSopenharmony_ci tmp4, tmp5, tmp6, tmp7); 3132cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, tmp0, 6, tmp1, 6, tmp2, 6, tmp3, 6, 3133cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 3134cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsrai_w, tmp4, 6, tmp5, 6, tmp6, 6, tmp7, 6, 3135cabdff1aSopenharmony_ci tmp4, tmp5, tmp6, tmp7); 3136cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, 3137cabdff1aSopenharmony_ci tmp7, tmp6, tmp0, tmp1, tmp2, tmp3); 3138cabdff1aSopenharmony_ci 3139cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp0, dst, 0, 0); 3140cabdff1aSopenharmony_ci dst += dst_stride; 3141cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp0, dst, 0, 1); 3142cabdff1aSopenharmony_ci dst += dst_stride; 3143cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp1, dst, 0, 0); 3144cabdff1aSopenharmony_ci dst += dst_stride; 3145cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp1, dst, 0, 1); 3146cabdff1aSopenharmony_ci dst += dst_stride; 3147cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp2, dst, 0, 0); 3148cabdff1aSopenharmony_ci dst += dst_stride; 3149cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp2, dst, 0, 1); 3150cabdff1aSopenharmony_ci dst += dst_stride; 3151cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp3, dst, 0, 0); 3152cabdff1aSopenharmony_ci dst += dst_stride; 3153cabdff1aSopenharmony_ci __lsx_vstelm_d(tmp3, dst, 0, 1); 3154cabdff1aSopenharmony_ci dst += dst_stride; 3155cabdff1aSopenharmony_ci 3156cabdff1aSopenharmony_ci dst10_r = dst98_r; 3157cabdff1aSopenharmony_ci dst21_r = dst109_r; 3158cabdff1aSopenharmony_ci dst22 = __lsx_vreplvei_d(dst106, 1); 3159cabdff1aSopenharmony_ci } 3160cabdff1aSopenharmony_ci} 3161cabdff1aSopenharmony_ci 3162cabdff1aSopenharmony_cistatic void hevc_hv_4t_16w_lsx(uint8_t *src, 3163cabdff1aSopenharmony_ci int32_t src_stride, 3164cabdff1aSopenharmony_ci int16_t *dst, 3165cabdff1aSopenharmony_ci int32_t dst_stride, 3166cabdff1aSopenharmony_ci const int8_t *filter_x, 3167cabdff1aSopenharmony_ci const int8_t *filter_y, 3168cabdff1aSopenharmony_ci int32_t height) 3169cabdff1aSopenharmony_ci{ 3170cabdff1aSopenharmony_ci if (4 == height) { 3171cabdff1aSopenharmony_ci hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride, 3172cabdff1aSopenharmony_ci filter_x, filter_y, 2); 3173cabdff1aSopenharmony_ci } else { 3174cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride, 3175cabdff1aSopenharmony_ci filter_x, filter_y, height, 2); 3176cabdff1aSopenharmony_ci } 3177cabdff1aSopenharmony_ci} 3178cabdff1aSopenharmony_ci 3179cabdff1aSopenharmony_cistatic void hevc_hv_4t_24w_lsx(uint8_t *src, 3180cabdff1aSopenharmony_ci int32_t src_stride, 3181cabdff1aSopenharmony_ci int16_t *dst, 3182cabdff1aSopenharmony_ci int32_t dst_stride, 3183cabdff1aSopenharmony_ci const int8_t *filter_x, 3184cabdff1aSopenharmony_ci const int8_t *filter_y, 3185cabdff1aSopenharmony_ci int32_t height) 3186cabdff1aSopenharmony_ci{ 3187cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride, 3188cabdff1aSopenharmony_ci filter_x, filter_y, height, 3); 3189cabdff1aSopenharmony_ci} 3190cabdff1aSopenharmony_ci 3191cabdff1aSopenharmony_cistatic void hevc_hv_4t_32w_lsx(uint8_t *src, 3192cabdff1aSopenharmony_ci int32_t src_stride, 3193cabdff1aSopenharmony_ci int16_t *dst, 3194cabdff1aSopenharmony_ci int32_t dst_stride, 3195cabdff1aSopenharmony_ci const int8_t *filter_x, 3196cabdff1aSopenharmony_ci const int8_t *filter_y, 3197cabdff1aSopenharmony_ci int32_t height) 3198cabdff1aSopenharmony_ci{ 3199cabdff1aSopenharmony_ci hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride, 3200cabdff1aSopenharmony_ci filter_x, filter_y, height, 4); 3201cabdff1aSopenharmony_ci} 3202cabdff1aSopenharmony_ci 3203cabdff1aSopenharmony_ci#define MC_COPY(WIDTH) \ 3204cabdff1aSopenharmony_civoid ff_hevc_put_hevc_pel_pixels##WIDTH##_8_lsx(int16_t *dst, \ 3205cabdff1aSopenharmony_ci uint8_t *src, \ 3206cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 3207cabdff1aSopenharmony_ci int height, \ 3208cabdff1aSopenharmony_ci intptr_t mx, \ 3209cabdff1aSopenharmony_ci intptr_t my, \ 3210cabdff1aSopenharmony_ci int width) \ 3211cabdff1aSopenharmony_ci{ \ 3212cabdff1aSopenharmony_ci hevc_copy_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE, height); \ 3213cabdff1aSopenharmony_ci} 3214cabdff1aSopenharmony_ci 3215cabdff1aSopenharmony_ciMC_COPY(4); 3216cabdff1aSopenharmony_ciMC_COPY(6); 3217cabdff1aSopenharmony_ciMC_COPY(8); 3218cabdff1aSopenharmony_ciMC_COPY(12); 3219cabdff1aSopenharmony_ciMC_COPY(16); 3220cabdff1aSopenharmony_ciMC_COPY(24); 3221cabdff1aSopenharmony_ciMC_COPY(32); 3222cabdff1aSopenharmony_ciMC_COPY(48); 3223cabdff1aSopenharmony_ciMC_COPY(64); 3224cabdff1aSopenharmony_ci 3225cabdff1aSopenharmony_ci#undef MC_COPY 3226cabdff1aSopenharmony_ci 3227cabdff1aSopenharmony_ci#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 3228cabdff1aSopenharmony_civoid ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_lsx(int16_t *dst, \ 3229cabdff1aSopenharmony_ci uint8_t *src, \ 3230cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 3231cabdff1aSopenharmony_ci int height, \ 3232cabdff1aSopenharmony_ci intptr_t mx, \ 3233cabdff1aSopenharmony_ci intptr_t my, \ 3234cabdff1aSopenharmony_ci int width) \ 3235cabdff1aSopenharmony_ci{ \ 3236cabdff1aSopenharmony_ci const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 3237cabdff1aSopenharmony_ci \ 3238cabdff1aSopenharmony_ci hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, \ 3239cabdff1aSopenharmony_ci MAX_PB_SIZE, filter, height); \ 3240cabdff1aSopenharmony_ci} 3241cabdff1aSopenharmony_ci 3242cabdff1aSopenharmony_ciMC(qpel, h, 4, 8, hz, mx); 3243cabdff1aSopenharmony_ciMC(qpel, h, 8, 8, hz, mx); 3244cabdff1aSopenharmony_ciMC(qpel, h, 12, 8, hz, mx); 3245cabdff1aSopenharmony_ciMC(qpel, h, 16, 8, hz, mx); 3246cabdff1aSopenharmony_ciMC(qpel, h, 24, 8, hz, mx); 3247cabdff1aSopenharmony_ciMC(qpel, h, 32, 8, hz, mx); 3248cabdff1aSopenharmony_ciMC(qpel, h, 48, 8, hz, mx); 3249cabdff1aSopenharmony_ciMC(qpel, h, 64, 8, hz, mx); 3250cabdff1aSopenharmony_ci 3251cabdff1aSopenharmony_ciMC(qpel, v, 4, 8, vt, my); 3252cabdff1aSopenharmony_ciMC(qpel, v, 8, 8, vt, my); 3253cabdff1aSopenharmony_ciMC(qpel, v, 12, 8, vt, my); 3254cabdff1aSopenharmony_ciMC(qpel, v, 16, 8, vt, my); 3255cabdff1aSopenharmony_ciMC(qpel, v, 24, 8, vt, my); 3256cabdff1aSopenharmony_ciMC(qpel, v, 32, 8, vt, my); 3257cabdff1aSopenharmony_ciMC(qpel, v, 48, 8, vt, my); 3258cabdff1aSopenharmony_ciMC(qpel, v, 64, 8, vt, my); 3259cabdff1aSopenharmony_ci 3260cabdff1aSopenharmony_ciMC(epel, h, 32, 4, hz, mx); 3261cabdff1aSopenharmony_ci 3262cabdff1aSopenharmony_ciMC(epel, v, 16, 4, vt, my); 3263cabdff1aSopenharmony_ciMC(epel, v, 24, 4, vt, my); 3264cabdff1aSopenharmony_ciMC(epel, v, 32, 4, vt, my); 3265cabdff1aSopenharmony_ci 3266cabdff1aSopenharmony_ci#undef MC 3267cabdff1aSopenharmony_ci 3268cabdff1aSopenharmony_ci#define MC_HV(PEL, WIDTH, TAP) \ 3269cabdff1aSopenharmony_civoid ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_lsx(int16_t *dst, \ 3270cabdff1aSopenharmony_ci uint8_t *src, \ 3271cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 3272cabdff1aSopenharmony_ci int height, \ 3273cabdff1aSopenharmony_ci intptr_t mx, \ 3274cabdff1aSopenharmony_ci intptr_t my, \ 3275cabdff1aSopenharmony_ci int width) \ 3276cabdff1aSopenharmony_ci{ \ 3277cabdff1aSopenharmony_ci const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 3278cabdff1aSopenharmony_ci const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 3279cabdff1aSopenharmony_ci \ 3280cabdff1aSopenharmony_ci hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE, \ 3281cabdff1aSopenharmony_ci filter_x, filter_y, height); \ 3282cabdff1aSopenharmony_ci} 3283cabdff1aSopenharmony_ci 3284cabdff1aSopenharmony_ciMC_HV(qpel, 4, 8); 3285cabdff1aSopenharmony_ciMC_HV(qpel, 8, 8); 3286cabdff1aSopenharmony_ciMC_HV(qpel, 12, 8); 3287cabdff1aSopenharmony_ciMC_HV(qpel, 16, 8); 3288cabdff1aSopenharmony_ciMC_HV(qpel, 24, 8); 3289cabdff1aSopenharmony_ciMC_HV(qpel, 32, 8); 3290cabdff1aSopenharmony_ciMC_HV(qpel, 48, 8); 3291cabdff1aSopenharmony_ciMC_HV(qpel, 64, 8); 3292cabdff1aSopenharmony_ci 3293cabdff1aSopenharmony_ciMC_HV(epel, 8, 4); 3294cabdff1aSopenharmony_ciMC_HV(epel, 12, 4); 3295cabdff1aSopenharmony_ciMC_HV(epel, 16, 4); 3296cabdff1aSopenharmony_ciMC_HV(epel, 24, 4); 3297cabdff1aSopenharmony_ciMC_HV(epel, 32, 4); 3298cabdff1aSopenharmony_ci 3299cabdff1aSopenharmony_ci#undef MC_HV 3300