/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *                Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "hevcdsp_lsx.h"

static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

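/* Combine the two bi-prediction operands: a saturating 16-bit add of each
 * operand pair followed by a rounding arithmetic shift right by 7 that
 * narrows to unsigned 8 bit with saturation, i.e. per element roughly
 * out = clip_u8((in + vec + 64) >> 7). */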
static av_always_inline __m128i
hevc_bi_rnd_clip(__m128i in0, __m128i vec0, __m128i in1, __m128i vec1)
{
    __m128i out;

    vec0 = __lsx_vsadd_h(in0, vec0);
    vec1 = __lsx_vsadd_h(in1, vec1);
    out = __lsx_vssrarni_bu_h(vec1, vec0, 7);
    return out;
}

/* hevc_bi_copy: dst = av_clip_uint8(((src0 << 6) + src1) >> 7) */
static
void hevc_bi_copy_4w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                         int16_t *src1_ptr, int32_t src2_stride,
                         uint8_t *dst, int32_t dst_stride, int32_t height)
{
    int32_t loop_cnt = height >> 3;
    int32_t res = (height & 0x07) >> 1;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_2x = (src2_stride << 1);
    int32_t src2_stride_4x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride;
    __m128i src0, src1;
    __m128i zero = __lsx_vldi(0);
    __m128i in0, in1, in2, in3;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3;
    __m128i dst0, dst1, dst2, dst3;

    for (;loop_cnt--;) {
        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
        src0_ptr += src_stride_4x;
        DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
        src0 = __lsx_vilvl_d(tmp1, tmp0);
        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
        DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
        src1 = __lsx_vilvl_d(tmp1, tmp0);
        src0_ptr += src_stride_4x;

        tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
        tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
        tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
        tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
        src1_ptr += src2_stride_4x;
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in0, in1);
        tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
        tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
        tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
        tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
        src1_ptr += src2_stride_4x;
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in2, in3);
        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst0, dst2);
        DUP2_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, dst1, dst3);
        DUP2_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst1, dst3);
        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
        __lsx_vstelm_w(dst0, dst + dst_stride_2x, 0, 2);
        __lsx_vstelm_w(dst0, dst + dst_stride_3x, 0, 3);
        dst += dst_stride_4x;
        __lsx_vstelm_w(dst1, dst, 0, 0);
        __lsx_vstelm_w(dst1, dst + dst_stride, 0, 1);
        __lsx_vstelm_w(dst1, dst + dst_stride_2x, 0, 2);
        __lsx_vstelm_w(dst1, dst + dst_stride_3x, 0, 3);
        dst += dst_stride_4x;
    }
    for(;res--;) {
        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_d(src1_ptr, 0);
        reg3 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
        src0 = __lsx_vilvl_w(reg1, reg0);
        in0 = __lsx_vilvl_d(reg3, reg2);
        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
        dst0 = __lsx_vsadd_h(dst0, in0);
        dst0 = __lsx_vssrarni_bu_h(dst0, dst0, 7);
        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
        src0_ptr += src_stride_2x;
        src1_ptr += src2_stride_2x;
        dst += dst_stride_2x;
    }
}

static
void hevc_bi_copy_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                         int16_t *src1_ptr, int32_t src2_stride,
                         uint8_t *dst, int32_t dst_stride, int32_t height)
{
    int32_t loop_cnt;
    int32_t res = (height & 0x07) >> 1;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    __m128i out0, out1, out2, out3;
    __m128i zero = __lsx_vldi(0);
    __m128i src0, src1, src2, src3;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i reg0, reg1, reg2, reg3;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
        src0_ptr += src_stride_4x;
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
        src0_ptr += src_stride_4x;
        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        in4 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in5, in6);
        in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
                  dst0, dst2, dst4, dst6);
        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
                  dst1, dst3, dst5, dst7);
        DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1, dst3,
                  dst5, dst7);
        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
        out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
        __lsx_vstelm_w(out0, dst, 0, 0);
        __lsx_vstelm_w(out0, dst + dst_stride, 0, 2);
        __lsx_vstelm_h(out0, dst, 4, 2);
        __lsx_vstelm_h(out0, dst + dst_stride, 4, 6);
        __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 2);
        __lsx_vstelm_h(out1, dst + dst_stride_2x, 4, 2);
        __lsx_vstelm_h(out1, dst + dst_stride_3x, 4, 6);
        dst += dst_stride_4x;
        __lsx_vstelm_w(out2, dst, 0, 0);
        __lsx_vstelm_w(out2, dst + dst_stride, 0, 2);
        __lsx_vstelm_h(out2, dst, 4, 2);
        __lsx_vstelm_h(out2, dst + dst_stride, 4, 6);
        __lsx_vstelm_w(out3, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_w(out3, dst + dst_stride_3x, 0, 2);
        __lsx_vstelm_h(out3, dst + dst_stride_2x, 4, 2);
        __lsx_vstelm_h(out3, dst + dst_stride_3x, 4, 6);
        dst += dst_stride_4x;
    }
    for (;res--;) {
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        src0 = __lsx_vilvl_d(reg1, reg0);
        src0_ptr += src_stride_2x;
        in0 = __lsx_vld(src1_ptr, 0);
        in1 = __lsx_vldx(src1_ptr, src2_stride_x);
        src1_ptr += src2_stride_x;
        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
        dst1 = __lsx_vilvh_b(zero, src0);
        dst1 = __lsx_vslli_h(dst1, 6);
        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        __lsx_vstelm_w(out0, dst, 0, 0);
        __lsx_vstelm_h(out0, dst, 4, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        __lsx_vstelm_h(out0, dst, 4, 6);
        dst += dst_stride;
    }
}

static
void hevc_bi_copy_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                         int16_t *src1_ptr, int32_t src2_stride,
                         uint8_t *dst, int32_t dst_stride, int32_t height)
{
    int32_t loop_cnt = height >> 3;
    int32_t res = (height & 7) >> 1;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    __m128i out0, out1, out2, out3;
    __m128i src0, src1, src2, src3;
    __m128i zero = __lsx_vldi(0);
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i reg0, reg1, reg2, reg3;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
        src0_ptr += src_stride_4x;
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
        src0_ptr += src_stride_4x;
        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
                  dst0, dst2, dst4, dst6);
        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
                  src3, dst1, dst3, dst5, dst7);
        DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1,
                  dst3, dst5, dst7);
        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        in4 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in5, in6);
        in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
        out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
        __lsx_vstelm_d(out0, dst, 0, 0);
        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
        dst += dst_stride_4x;
        __lsx_vstelm_d(out2, dst, 0, 0);
        __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(out3, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out3, dst + dst_stride_3x, 0, 1);
        dst += dst_stride_4x;
    }
    for (;res--;) {
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        src0 = __lsx_vilvl_d(reg1, reg0);
        in0 = __lsx_vld(src1_ptr, 0);
        in1 = __lsx_vldx(src1_ptr, src2_stride_x);
        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
        dst1 = __lsx_vilvh_b(zero, src0);
        dst1 = __lsx_vslli_h(dst1, 6);
        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
        src0_ptr += src_stride_2x;
        src1_ptr += src2_stride_x;
        dst += dst_stride_2x;
    }
}

static
void hevc_bi_copy_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    uint32_t loop_cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    int16_t* _src1 = src1_ptr + 8;
    __m128i out0, out1, out2;
    __m128i src0, src1, src2, src3;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5;

    for (loop_cnt = 4; loop_cnt--;) {
        src0 = __lsx_vld(src0_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
                  src1, src2);
        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
        src0_ptr += src_stride_4x;
        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        in4 = __lsx_vld(_src1, 0);
        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
                  in5, in6);
        in7 = __lsx_vldx(_src1, src2_stride_3x);
        _src1 += src2_stride_2x;

        DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
                  dst0, dst1, dst2, dst3)
        DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1);
        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst4, dst5)
        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
        __lsx_vstelm_d(out0, dst, 0, 0);
        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
        __lsx_vstelm_w(out2, dst, 8, 0);
        __lsx_vstelm_w(out2, dst + dst_stride, 8, 1);
        __lsx_vstelm_w(out2, dst + dst_stride_2x, 8, 2);
        __lsx_vstelm_w(out2, dst + dst_stride_3x, 8, 3);
        dst += dst_stride_4x;
    }
}

static
void hevc_bi_copy_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    uint32_t loop_cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    int16_t *_src1 = src1_ptr + 8;
    __m128i out0, out1, out2, out3;
    __m128i src0, src1, src2, src3;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    __m128i zero = {0};

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0 = __lsx_vld(src0_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
                  src1, src2);
        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
        src0_ptr += src_stride_4x;
        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        in4 = __lsx_vld(_src1, 0);
        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
                  in5, in6);
        in7 = __lsx_vldx(_src1, src2_stride_3x);
        _src1 += src2_stride_2x;
        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
                  dst0_r, dst1_r, dst2_r, dst3_r)
        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
                  dst0_l, dst1_l, dst2_l, dst3_l);
        DUP4_ARG2(__lsx_vslli_h, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
                  dst0_l, dst1_l, dst2_l, dst3_l);

        out0 = hevc_bi_rnd_clip(in0, dst0_r, in4, dst0_l);
        out1 = hevc_bi_rnd_clip(in1, dst1_r, in5, dst1_l);
        out2 = hevc_bi_rnd_clip(in2, dst2_r, in6, dst2_l);
        out3 = hevc_bi_rnd_clip(in3, dst3_r, in7, dst3_l);
        __lsx_vst(out0, dst, 0);
        __lsx_vstx(out1, dst, dst_stride);
        __lsx_vstx(out2, dst, dst_stride_2x);
        __lsx_vstx(out3, dst, dst_stride_3x);
        dst += dst_stride_4x;
    }
}

static
void hevc_bi_copy_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                         dst, dst_stride, height);
    hevc_bi_copy_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                        dst + 16, dst_stride, height);
}

static
void hevc_bi_copy_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                         dst, dst_stride, height);
    hevc_bi_copy_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                         dst + 16, dst_stride, height);
}

static
void hevc_bi_copy_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                         dst, dst_stride, height);
    hevc_bi_copy_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                         dst + 16, dst_stride, height);
}

static
void hevc_bi_copy_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    hevc_bi_copy_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                         dst, dst_stride, height);
    hevc_bi_copy_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
                         dst + 32, dst_stride, height);
}

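/* 8-tap horizontal filters for bi-prediction.
 * mask0..mask3 (derived from ff_hevc_mask_arr) gather the sliding 8-pixel
 * windows with __lsx_vshuf_b, the taps are applied as vdp2/vdp2add byte
 * dot products accumulating into 16-bit lanes, and the result is merged
 * with the int16_t reference block through hevc_bi_rnd_clip(). */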
static void hevc_hz_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i src0, src1, src2, src3;
    __m128i filt0, filt1, filt2, filt3;
    __m128i mask1, mask2, mask3;
    __m128i vec0, vec1, vec2, vec3;
    __m128i dst0, dst1, dst2, dst3;
    __m128i in0, in1, in2, in3;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);

    src0_ptr -= 3;
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
        src1_ptr += src2_stride;
        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
        src1_ptr += src2_stride;

        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
                  vec2, vec3);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
                  vec3, filt0, dst0, dst1, dst2, dst3);
        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
                  vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
                  vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
                  vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);

        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        __lsx_vst(dst0, dst, 0);
        __lsx_vstx(dst1, dst, dst_stride);
        dst += dst_stride_2x;
    }
}

static void hevc_hz_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    __m128i src0, src1, tmp0, tmp1;
    __m128i filt0, filt1, filt2, filt3;
    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    __m128i vec0, vec1, vec2, vec3;
    __m128i dst0, dst1, dst2;
    __m128i in0, in1, in2;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);

    src0_ptr -= 3;
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
              mask2, mask3, mask4);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
    mask7 = __lsx_vaddi_bu(mask0, 14);

    for (loop_cnt = height; loop_cnt--;) {
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
        in2 = __lsx_vld(src1_ptr, 32);
        src1_ptr += src2_stride;

        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
                  src1, mask0, src0, src0, mask1, vec0, vec1, vec2, vec3);
        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, dst0, dst1);
        dst2 = __lsx_vdp2_h_bu_b(vec2, filt0);
        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt1);
        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask5, src1, src1, mask1, src0,
                  src0, mask2, src1, src0, mask6, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec0, filt1, dst2, vec1, filt1,
                  dst0, vec2, filt2, dst1, vec3, filt2, dst1, dst2, dst0, dst1);
        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask2, src0, src0, mask3, src1, src0,
                  mask7, src1, src1, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec0, filt2, dst0, vec1, filt3,
                  dst1, vec2, filt3, dst2, vec3, filt3, dst2, dst0, dst1, dst2);

        tmp0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        dst2 = __lsx_vsadd_h(dst2, in2);
        tmp1 = __lsx_vssrarni_bu_h(dst2, dst2, 7);

        __lsx_vst(tmp0, dst, 0);
        __lsx_vstelm_d(tmp1, dst, 16, 0);
        dst += dst_stride;
    }
}

static void hevc_hz_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                       dst, dst_stride, filter, height);
    hevc_hz_8t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                       dst + 16, dst_stride, filter, height);
}

static void hevc_hz_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                       dst, dst_stride, filter, height);
    hevc_hz_8t_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                       dst + 16, dst_stride, filter, height);
}

static void hevc_hz_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_hz_8t_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                       dst, dst_stride, filter, height);
    hevc_hz_8t_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
                       dst + 32, dst_stride, filter, height);
}

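/* 8-tap vertical filters for bi-prediction.
 * Seven source rows are preloaded and interleaved pair-wise (vilvl/vilvh) so
 * the inner loop only loads the new rows; the interleaved row pairs feed the
 * vdp2/vdp2add dot products and the register window is rotated afterwards. */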
static av_always_inline
void hevc_vt_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
                       int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
                       const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    __m128i src0, src1, src2, src3, src4, src5;
    __m128i src6, src7, src8, src9, src10;
    __m128i in0, in1, in2, in3;
    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
    __m128i filt0, filt1, filt2, filt3;

    src0_ptr -= src_stride_3x;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    src0 = __lsx_vld(src0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src1, src2);
    src3 = __lsx_vldx(src0_ptr, src_stride_3x);
    src0_ptr += src_stride_4x;
    src4 = __lsx_vld(src0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src5, src6);
    src0_ptr += src_stride_3x;
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              src10_r, src32_r, src54_r, src21_r);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src7 = __lsx_vld(src0_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
                  src8, src9);
        src10 = __lsx_vldx(src0_ptr, src_stride_3x);
        src0_ptr += src_stride_4x;
        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
                  in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src76_r, src87_r, src98_r, src109_r);

        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
                  filt0, src43_r, filt0, dst0_r, dst1_r, dst2_r, dst3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
                  filt1, dst2_r, src54_r, filt1, dst3_r, src65_r, filt1,
                  dst0_r, dst1_r, dst2_r, dst3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r, src65_r,
                  filt2, dst2_r, src76_r, filt2, dst3_r, src87_r, filt2,
                  dst0_r, dst1_r, dst2_r, dst3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r, src87_r,
                  filt3, dst2_r, src98_r, filt3, dst3_r, src109_r, filt3,
                  dst0_r, dst1_r, dst2_r, dst3_r);

        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
        dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
        __lsx_vstelm_d(dst0_r, dst, 0, 0);
        __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
        dst += dst_stride_4x;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;

        src6 = src10;
    }
}

static av_always_inline
void hevc_vt_8t_16multx2mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
                                 int16_t *src1_ptr, int32_t src2_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height,
                                 int32_t width)
{
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    uint32_t cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i in0, in1, in2, in3;
    __m128i src10_r, src32_r, src54_r, src76_r;
    __m128i src21_r, src43_r, src65_r, src87_r;
    __m128i dst0_r, dst1_r;
    __m128i src10_l, src32_l, src54_l, src76_l;
    __m128i src21_l, src43_l, src65_l, src87_l;
    __m128i dst0_l, dst1_l;
    __m128i filt0, filt1, filt2, filt3;

    src0_ptr -= src_stride_3x;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        src0 = __lsx_vld(src0_ptr_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
                  src_stride_2x, src1, src2);
        src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
        src0_ptr_tmp += src_stride_4x;
        src4 = __lsx_vld(src0_ptr_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
                  src_stride_2x, src5, src6);
        src0_ptr_tmp += src_stride_3x;

        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  src10_r, src32_r, src54_r, src21_r);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  src10_l, src32_l, src54_l, src21_l);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            src7 = __lsx_vld(src0_ptr_tmp, 0);
            src8 = __lsx_vldx(src0_ptr_tmp, src_stride);
            src0_ptr_tmp += src_stride_2x;
            DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in0, in2);
            src1_ptr_tmp += src2_stride;
            DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in1, in3);
            src1_ptr_tmp += src2_stride;

            DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
            DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);

            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
                      filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
                      src43_r, filt1, dst0_l, src32_l, filt1, dst1_l, src43_l,
                      filt1, dst0_r, dst1_r, dst0_l, dst1_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r,
                      src65_r, filt2, dst0_l, src54_l, filt2, dst1_l, src65_l,
                      filt2, dst0_r, dst1_r, dst0_l, dst1_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r,
                      src87_r, filt3, dst0_l, src76_l, filt3, dst1_l, src87_l,
                      filt3, dst0_r, dst1_r, dst0_l, dst1_l);
            dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
            dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);

            __lsx_vst(dst0_r, dst_tmp, 0);
            __lsx_vstx(dst1_r, dst_tmp, dst_stride);
            dst_tmp += dst_stride_2x;

            src10_r = src32_r;
            src32_r = src54_r;
            src54_r = src76_r;
            src21_r = src43_r;
            src43_r = src65_r;
            src65_r = src87_r;
            src10_l = src32_l;
            src32_l = src54_l;
            src54_l = src76_l;
            src21_l = src43_l;
            src43_l = src65_l;
            src65_l = src87_l;
            src6 = src8;
        }

        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}

static void hevc_vt_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 16);
}

static void hevc_vt_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 16);
    hevc_vt_8t_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                      dst + 16, dst_stride, filter, height);
}

static void hevc_vt_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 32);
}

static void hevc_vt_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 48);
}

static void hevc_vt_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 64);
}

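/* 8-tap horizontal + vertical (hv) filters for bi-prediction.
 * The first seven rows are filtered horizontally into 16-bit intermediates;
 * each loop iteration filters one new row, runs the vertical 8-tap pass on
 * the interleaved intermediates at 32-bit precision, shifts down by 6,
 * packs back to 16 bit and combines with the int16_t reference samples. */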
static av_always_inline
void hevc_hv_8t_8multx1mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
                                int16_t *src1_ptr, int32_t src2_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter_x, const int8_t *filter_y,
                                int32_t height, int32_t width)
{
    uint32_t loop_cnt;
    uint32_t cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    __m128i out;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i in0, tmp;
    __m128i filt0, filt1, filt2, filt3;
    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1, mask2, mask3;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i dst0_r, dst0_l;
    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
    __m128i dst10_l, dst32_l, dst54_l, dst76_l;

    src0_ptr -= src_stride_3x + 3;

    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, filter_x,
              6, filt0, filt1, filt2, filt3);
    filt_h3 = __lsx_vld(filter_y, 0);
    filt_h3 = __lsx_vsllwil_h_b(filt_h3, 0);

    DUP4_ARG2(__lsx_vreplvei_w, filt_h3, 0, filt_h3, 1, filt_h3, 2, filt_h3, 3,
              filt_h0, filt_h1, filt_h2, filt_h3);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        src0 = __lsx_vld(src0_ptr_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
                  src_stride_2x, src1, src2);
        src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
        src0_ptr_tmp += src_stride_4x;
        src4 = __lsx_vld(src0_ptr_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
                  src_stride_2x, src5, src6);
        src0_ptr_tmp += src_stride_3x;

        /* row 0 row 1 row 2 row 3 */
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
                  vec12, filt0, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
                  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
                  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
                  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);

        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
                  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
                  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);

        for (loop_cnt = height; loop_cnt--;) {
            src7 = __lsx_vld(src0_ptr_tmp, 0);
            src0_ptr_tmp += src_stride;

            in0 = __lsx_vld(src1_ptr_tmp, 0);
            src1_ptr_tmp += src2_stride;

            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
                      filt2, dst7, dst7);
            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
            DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
                      dst6, dst10_r, dst32_r, dst54_r, dst76_r);
            DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
                      dst6, dst10_l, dst32_l, dst54_l, dst76_l);

            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
                      dst0_r, dst0_l);
            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
                      dst76_l, filt_h3, dst0_r, dst0_l);
            dst0_r = __lsx_vsrli_w(dst0_r, 6);
            dst0_l = __lsx_vsrli_w(dst0_l, 6);

            tmp = __lsx_vpickev_h(dst0_l, dst0_r);
            tmp = __lsx_vsadd_h(tmp, in0);
            tmp = __lsx_vmaxi_h(tmp, 0);
            out = __lsx_vssrlrni_bu_h(tmp, tmp, 7);
            __lsx_vstelm_d(out, dst_tmp, 0, 0);
            dst_tmp += dst_stride;

            dst0 = dst1;
            dst1 = dst2;
            dst2 = dst3;
            dst3 = dst4;
            dst4 = dst5;
            dst5 = dst6;
            dst6 = dst7;
        }

        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}

static void hevc_hv_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                              int16_t *src1_ptr, int32_t src2_stride,
                              uint8_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 8);
}

static void hevc_hv_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 16);
}

static void hevc_hv_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 24);
}

static void hevc_hv_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 32);
}

static void hevc_hv_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 48);
}

static void hevc_hv_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 64);
}

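/* 4-tap (chroma) horizontal filters for bi-prediction: the same shuffle and
 * dot-product scheme as the 8-tap versions, with only two packed tap pairs
 * (filt0/filt1). */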
static void hevc_hz_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_x = src2_stride << 1;
    int32_t src2_stride_2x = src2_stride << 2;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;

    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i filt0, filt1;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1, mask2, mask3;
    __m128i vec0, vec1, vec2, vec3;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src0_ptr -= 1;
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 10);

    dst_tmp = dst + 16;
    src1_ptr_tmp = src1_ptr + 16;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src5);
        src0_ptr += src_stride;
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src6, src7);
        src0_ptr += src_stride;

        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
        src1_ptr += src2_stride;
        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
        src1_ptr += src2_stride;
        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in4, in5);
        src1_ptr += src2_stride;
        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in6, in7);
        src1_ptr += src2_stride;

        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src2,
                  src2, mask0, src3, src2, mask2, vec0, vec1, vec2, vec3);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
                  vec3, filt0, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src2,
                  src2, mask1, src3, src2, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);

        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src4, mask2, src6,
                  src6, mask0, src7, src6, mask2, vec0, vec1, vec2, vec3);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
                  vec3, filt0, dst4, dst5, dst6, dst7);
        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src4, mask3, src6,
                  src6, mask1, src7, src6, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec0, filt1, dst5, vec1, filt1,
                  dst6, vec2, filt1, dst7, vec3, filt1, dst4, dst5, dst6, dst7);

        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        dst2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
        dst3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
        __lsx_vst(dst0, dst, 0);
        __lsx_vstx(dst1, dst, dst_stride);
        __lsx_vstx(dst2, dst, dst_stride_2x);
        __lsx_vstx(dst3, dst, dst_stride_3x);
        dst += dst_stride_4x;

        in0 = __lsx_vld(src1_ptr_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
        src1_ptr_tmp += src2_stride_2x;

        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src3, src3, mask0, src5,
                  src5, mask0, src7, src7, mask0, vec0, vec1, vec2, vec3);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
                  vec3, filt0, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask1, src3, src3, mask1, src5,
                  src5, mask1, src7, src7, mask1, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        __lsx_vstelm_d(dst0, dst_tmp, 0, 0);
        __lsx_vstelm_d(dst0, dst_tmp + dst_stride, 0, 1);
        __lsx_vstelm_d(dst1, dst_tmp + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(dst1, dst_tmp + dst_stride_3x, 0, 1);
        dst_tmp += dst_stride_4x;
    }
}

static void hevc_hz_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    __m128i src0, src1, src2;
    __m128i in0, in1, in2, in3;
    __m128i filt0, filt1;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1, mask2, mask3;
    __m128i dst0, dst1, dst2, dst3;
    __m128i vec0, vec1, vec2, vec3;

    src0_ptr -= 1;

    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 10);

    for (loop_cnt = height; loop_cnt--;) {
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
        src2 = __lsx_vld(src0_ptr, 24);
        src0_ptr += src_stride;
        DUP4_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, src1_ptr, 32,
                  src1_ptr, 48, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src1,
                  src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
                  vec3, filt0, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src1,
                  src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        __lsx_vst(dst0, dst, 0);
        __lsx_vst(dst1, dst, 16);
        dst += dst_stride;
    }
}

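/* 4-tap (chroma) vertical filters for bi-prediction.  Previously loaded rows
 * are kept interleaved pair-wise so each iteration only loads the new rows,
 * applies the two tap pairs via vdp2/vdp2add and rotates the interleaved
 * registers. */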
src43_l, src65_l; 1225 __m128i src2110, src4332, src6554; 1226 __m128i dst0_l, dst1_l, filt0, filt1; 1227 1228 src0_ptr -= src_stride; 1229 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 1230 1231 src0 = __lsx_vld(src0_ptr, 0); 1232 DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 1233 src1, src2); 1234 src0_ptr += src_stride_3x; 1235 DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 1236 DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 1237 src2110 = __lsx_vilvl_d(src21_l, src10_l); 1238 1239 for (loop_cnt = (height >> 2); loop_cnt--;) { 1240 src3 = __lsx_vld(src0_ptr, 0); 1241 DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 1242 src4, src5); 1243 src6 = __lsx_vldx(src0_ptr, src_stride_3x); 1244 src0_ptr += src_stride_4x; 1245 in0 = __lsx_vld(src1_ptr, 0); 1246 DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, 1247 src2_stride_2x, in1, in2); 1248 in3 = __lsx_vldx(src1_ptr, src2_stride_3x); 1249 src1_ptr += src2_stride_2x; 1250 in4 = __lsx_vld(_src1, 0); 1251 DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x, 1252 in5, in6); 1253 in7 = __lsx_vldx(_src1, src2_stride_3x); 1254 _src1 += src2_stride_2x; 1255 DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5); 1256 1257 DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 1258 DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 1259 src4332 = __lsx_vilvl_d(src43_l, src32_l); 1260 DUP2_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src54_r, src65_r); 1261 DUP2_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src54_l, src65_l); 1262 src6554 = __lsx_vilvl_d(src65_l, src54_l); 1263 1264 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src2110, 1265 filt0, src32_r, filt0, dst0_r, dst1_r, dst0_l, dst2_r); 1266 DUP2_ARG2(__lsx_vdp2_h_bu_b, src43_r, filt0, src4332, filt0, 1267 dst3_r, dst1_l); 1268 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, 1269 src43_r, filt1, dst0_l, src4332, filt1, dst2_r, src54_r, 1270 filt1, dst0_r, dst1_r, dst0_l, dst2_r); 1271 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst1_l, 1272 src6554, filt1, dst3_r, dst1_l); 1273 dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r); 1274 dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r); 1275 dst0_l = hevc_bi_rnd_clip(in4, dst0_l, in5, dst1_l); 1276 __lsx_vstelm_d(dst0_r, dst, 0, 0); 1277 __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1); 1278 __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0); 1279 __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1); 1280 __lsx_vstelm_w(dst0_l, dst, 8, 0); 1281 __lsx_vstelm_w(dst0_l, dst + dst_stride, 8, 1); 1282 __lsx_vstelm_w(dst0_l, dst + dst_stride_2x, 8, 2); 1283 __lsx_vstelm_w(dst0_l, dst + dst_stride_3x, 8, 3); 1284 dst += dst_stride_4x; 1285 1286 src2 = src6; 1287 src10_r = src54_r; 1288 src21_r = src65_r; 1289 src2110 = src6554; 1290 } 1291} 1292 1293static void hevc_vt_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1294 int16_t *src1_ptr, int32_t src2_stride, 1295 uint8_t *dst, int32_t dst_stride, 1296 const int8_t *filter, int32_t height) 1297{ 1298 int32_t loop_cnt; 1299 const int32_t src_stride_2x = (src_stride << 1); 1300 const int32_t dst_stride_2x = (dst_stride << 1); 1301 const int32_t src_stride_3x = src_stride_2x + src_stride; 1302 __m128i src0, src1, src2, src3, src4, src5; 1303 __m128i in0, in1, in2, in3; 1304 __m128i src10_r, src32_r, src21_r, src43_r; 1305 __m128i src10_l, src32_l, src21_l, src43_l; 1306 __m128i dst0_r, dst1_r, dst0_l, dst1_l; 
1307 __m128i filt0, filt1; 1308 1309 src0_ptr -= src_stride; 1310 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 1311 1312 src0 = __lsx_vld(src0_ptr, 0); 1313 DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, 1314 src1, src2); 1315 src0_ptr += src_stride_3x; 1316 DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 1317 DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 1318 1319 for (loop_cnt = (height >> 2); loop_cnt--;) { 1320 src3 = __lsx_vld(src0_ptr, 0); 1321 src4 = __lsx_vldx(src0_ptr, src_stride); 1322 src0_ptr += src_stride_2x; 1323 DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2); 1324 src1_ptr += src2_stride; 1325 DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3); 1326 src1_ptr += src2_stride; 1327 DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 1328 DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 1329 1330 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l, 1331 filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l); 1332 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r, 1333 filt1, dst0_l, src32_l, filt1, dst1_l, src43_l, filt1, 1334 dst0_r, dst1_r, dst0_l, dst1_l); 1335 1336 dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l); 1337 dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l); 1338 __lsx_vst(dst0_r, dst, 0); 1339 __lsx_vstx(dst1_r, dst, dst_stride); 1340 dst += dst_stride_2x; 1341 1342 src5 = __lsx_vld(src0_ptr, 0); 1343 src2 = __lsx_vldx(src0_ptr, src_stride); 1344 src0_ptr += src_stride_2x; 1345 DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2); 1346 src1_ptr += src2_stride; 1347 DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3); 1348 src1_ptr += src2_stride; 1349 DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r); 1350 DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l); 1351 1352 DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r, 1353 filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 1354 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, 1355 src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, 1356 filt1, dst0_r, dst0_l, dst1_r, dst1_l); 1357 dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l); 1358 dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l); 1359 __lsx_vst(dst0_r, dst, 0); 1360 __lsx_vstx(dst1_r, dst, dst_stride); 1361 dst += dst_stride_2x; 1362 } 1363} 1364 1365static void hevc_vt_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride, 1366 int16_t *src1_ptr, int32_t src2_stride, 1367 uint8_t *dst, int32_t dst_stride, 1368 const int8_t *filter, int32_t height) 1369{ 1370 uint32_t loop_cnt; 1371 int32_t dst_stride_2x = dst_stride << 1; 1372 __m128i src0, src1, src2, src3, src4, src5; 1373 __m128i src6, src7, src8, src9, src10, src11; 1374 __m128i in0, in1, in2, in3, in4, in5; 1375 __m128i src10_r, src32_r, src76_r, src98_r; 1376 __m128i src21_r, src43_r, src87_r, src109_r; 1377 __m128i src10_l, src32_l, src21_l, src43_l; 1378 __m128i dst0_r, dst1_r, dst2_r, dst3_r; 1379 __m128i dst0_l, dst1_l; 1380 __m128i filt0, filt1; 1381 1382 src0_ptr -= src_stride; 1383 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 1384 1385 /* 16width */ 1386 DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src6); 1387 src0_ptr += src_stride; 1388 DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src1, src7); 1389 src0_ptr += src_stride; 1390 DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8); 1391 src0_ptr 
+= src_stride; 1392 DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); 1393 DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); 1394 /* 8width */ 1395 DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r); 1396 1397 for (loop_cnt = (height >> 2); loop_cnt--;) { 1398 /* 16width */ 1399 DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src3, src9); 1400 src0_ptr += src_stride; 1401 DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src10); 1402 src0_ptr += src_stride; 1403 DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2); 1404 in4 = __lsx_vld(src1_ptr, 32); 1405 src1_ptr += src2_stride; 1406 DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3); 1407 in5 = __lsx_vld(src1_ptr, 32); 1408 src1_ptr += src2_stride; 1409 DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); 1410 DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l); 1411 /* 8width */ 1412 DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r); 1413 /* 16width */ 1414 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r, 1415 filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 1416 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l, 1417 src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l, filt1, 1418 dst0_r, dst0_l, dst1_r, dst1_l); 1419 /* 8width */ 1420 DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0, 1421 dst2_r, dst3_r); 1422 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r, 1423 src109_r, filt1, dst2_r, dst3_r); 1424 /* 16width */ 1425 dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l); 1426 dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l); 1427 dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r); 1428 __lsx_vst(dst0_r, dst, 0); 1429 __lsx_vstx(dst1_r, dst, dst_stride); 1430 __lsx_vstelm_d(dst2_r, dst, 16, 0); 1431 __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1); 1432 dst += dst_stride_2x; 1433 1434 /* 16width */ 1435 DUP4_ARG2(__lsx_vld, src0_ptr, 0, src1_ptr, 0, src1_ptr, 16, src1_ptr, 1436 32, src5, in0, in2, in4); 1437 src1_ptr += src2_stride; 1438 DUP4_ARG2(__lsx_vld, src0_ptr, 16, src1_ptr, 0, src1_ptr, 16, src1_ptr, 1439 32, src11, in1, in3, in5); 1440 src1_ptr += src2_stride; 1441 src0_ptr += src_stride; 1442 DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8); 1443 src0_ptr += src_stride; 1444 DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r); 1445 DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l); 1446 /* 8width */ 1447 DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r); 1448 /* 16width */ 1449 DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r, 1450 filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l); 1451 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, 1452 src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, 1453 filt1, dst0_r, dst0_l, dst1_r, dst1_l); 1454 1455 /* 8width */ 1456 DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0, 1457 dst2_r, dst3_r); 1458 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r, 1459 src87_r, filt1, dst2_r, dst3_r); 1460 1461 dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l); 1462 dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l); 1463 dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r); 1464 __lsx_vst(dst0_r, dst, 0); 1465 __lsx_vstx(dst1_r, dst, dst_stride); 1466 __lsx_vstelm_d(dst2_r, dst, 16, 0); 1467 __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1); 1468 dst += dst_stride_2x; 1469 } 1470} 
static void hevc_vt_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_4t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                       dst, dst_stride, filter, height);
    hevc_vt_4t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                       dst + 16, dst_stride, filter, height);
}

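/* Horizontal + vertical 4-tap filter, 6 columns x 8 rows: the horizontal
 * pass yields 16-bit intermediates (dsth0..dsth10), the vertical pass
 * accumulates them as 32-bit words and shifts by 6, then the 16-bit
 * prediction from src1_ptr is added and the rounded (>> 7) result is
 * clamped to 8 bits.  Each 6-wide output row is stored as a 4-byte word
 * plus a 2-byte halfword. */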
static void hevc_hv_4t_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                              int16_t *src1_ptr, int32_t src2_stride,
                              uint8_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_2x = (src2_stride << 1);
    int32_t src2_stride_4x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride;
    __m128i out0, out1;
    __m128i src0, src1, src2, src3, src4, src5, src6;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, mask1;
    __m128i filt0, filt1, filt_h0, filt_h1;
    __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5;
    __m128i dsth6, dsth7, dsth8, dsth9, dsth10;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst4_r, dst5_r, dst6_r, dst7_r;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
    __m128i reg0, reg1, reg2, reg3;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);

    src0_ptr -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    filt_h1 = __lsx_vld(filter_y, 0);
    filt_h1 = __lsx_vsllwil_h_b(filt_h1, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filt_h1, 0, filt_h1, 1, filt_h0, filt_h1);

    mask1 = __lsx_vaddi_bu(mask0, 2);

    src0 = __lsx_vld(src0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src1, src2);
    src0_ptr += src_stride_3x;

    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);

    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
    dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
              dsth0, dsth1);
    dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);

    DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, tmp0, tmp2);
    DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, tmp1, tmp3);

    src3 = __lsx_vld(src0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src4, src5);
    src6 = __lsx_vldx(src0_ptr, src_stride_3x);
    src0_ptr += src_stride_4x;
    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);

    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
              filt0, dsth3, dsth4, dsth5, dsth6);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4, vec3, filt1,
              dsth5, vec5, filt1, dsth6, vec7, filt1, dsth3, dsth4, dsth5,
              dsth6);

    src3 = __lsx_vld(src0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src4, src5);
    src6 = __lsx_vldx(src0_ptr, src_stride_3x);

    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);

    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
              filt0, dsth7, dsth8, dsth9, dsth10);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth7, vec1, filt1, dsth8, vec3, filt1,
              dsth9, vec5, filt1, dsth10, vec7, filt1, dsth7, dsth8, dsth9,
              dsth10);

    DUP2_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, tmp4, tmp6);
    DUP2_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, tmp5, tmp7);
    DUP2_ARG2(__lsx_vilvl_h, dsth5, dsth4, dsth6, dsth5, dsth0, dsth2);
    DUP2_ARG2(__lsx_vilvh_h, dsth5, dsth4, dsth6, dsth5, dsth1, dsth3);
    DUP4_ARG2(__lsx_vdp2_w_h, tmp0, filt_h0, tmp2, filt_h0, tmp4, filt_h0,
              tmp6, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, tmp4, filt_h1, dst1_r, tmp6,
              filt_h1, dst2_r, dsth0, filt_h1, dst3_r, dsth2, filt_h1,
              dst0_r, dst1_r, dst2_r, dst3_r);
    DUP2_ARG2(__lsx_vpickev_d, tmp3, tmp1, tmp7, tmp5, tmp0, tmp8);
    dst0_l = __lsx_vdp2_w_h(tmp0, filt_h0);
    dst0_l = __lsx_vdp2add_w_h(dst0_l, tmp8, filt_h1);

    DUP2_ARG2(__lsx_vilvl_h, dsth7, dsth6, dsth8, dsth7, tmp0, tmp2);
    DUP2_ARG2(__lsx_vilvh_h, dsth7, dsth6, dsth8, dsth7, tmp1, tmp3);
    DUP2_ARG2(__lsx_vilvl_h, dsth9, dsth8, dsth10, dsth9, tmp4, tmp6);
    DUP2_ARG2(__lsx_vilvh_h, dsth9, dsth8, dsth10, dsth9, tmp5, tmp7);
    DUP4_ARG2(__lsx_vdp2_w_h, dsth0, filt_h0, dsth2, filt_h0, tmp0, filt_h0,
              tmp2, filt_h0, dst4_r, dst5_r, dst6_r, dst7_r);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, tmp0, filt_h1, dst5_r, tmp2,
              filt_h1, dst6_r, tmp4, filt_h1, dst7_r, tmp6, filt_h1,
              dst4_r, dst5_r, dst6_r, dst7_r);
    DUP2_ARG2(__lsx_vpickev_d, dsth3, dsth1, tmp3, tmp1, tmp0, tmp1);
    tmp2 = __lsx_vpickev_d(tmp7, tmp5);

    DUP2_ARG2(__lsx_vdp2_w_h, tmp8, filt_h0, tmp0, filt_h0, dst1_l, dst2_l);
    dst3_l = __lsx_vdp2_w_h(tmp1, filt_h0);
    DUP2_ARG3(__lsx_vdp2add_w_h, dst1_l, tmp0, filt_h1, dst2_l, tmp1, filt_h1,
              dst1_l, dst2_l);
    dst3_l = __lsx_vdp2add_w_h(dst3_l, tmp2, filt_h1);

    DUP4_ARG2(__lsx_vsrai_d, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
              dst0_r, dst1_r, dst2_r, dst3_r);
    DUP4_ARG2(__lsx_vsrai_d, dst4_r, 6, dst5_r, 6, dst6_r, 6, dst7_r, 6,
              dst4_r, dst5_r, dst6_r, dst7_r);
    DUP4_ARG2(__lsx_vsrai_d, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
              dst0_l, dst1_l, dst2_l, dst3_l);
    DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    DUP2_ARG2(__lsx_vpickev_h, dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    DUP2_ARG2(__lsx_vpickev_h, dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);

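    /* columns 0..3: add the 16-bit prediction rows, clamp and store
     * one word per output row */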
    reg0 = __lsx_vldrepl_d(src1_ptr, 0);
    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
    dsth0 = __lsx_vilvl_d(reg1, reg0);
    reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
    dsth1 = __lsx_vilvl_d(reg1, reg0);
    src1_ptr += src2_stride_4x;
    reg0 = __lsx_vldrepl_d(src1_ptr, 0);
    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
    dsth2 = __lsx_vilvl_d(reg1, reg0);
    reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
    dsth3 = __lsx_vilvl_d(reg1, reg0);

    DUP4_ARG2(__lsx_vsadd_h, dsth0, tmp0, dsth1, tmp1, dsth2, tmp2, dsth3,
              tmp3, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);

    __lsx_vstelm_w(out0, dst, 0, 0);
    __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
    __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
    __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lsx_vstelm_w(out1, dst, 0, 0);
    __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
    __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
    __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
    dst -= dst_stride_4x;

    src1_ptr -= src2_stride_4x;

    reg0 = __lsx_vldrepl_w(src1_ptr, 8);
    reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
    reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
    reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
    DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
    dsth4 = __lsx_vilvl_d(tmp1, tmp0);
    src1_ptr += src2_stride_4x;

    reg0 = __lsx_vldrepl_w(src1_ptr, 8);
    reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
    reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
    reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
    DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
    dsth5 = __lsx_vilvl_d(tmp1, tmp0);
    DUP2_ARG2(__lsx_vsadd_h, dsth4, tmp4, dsth5, tmp5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
    out0 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);

    __lsx_vstelm_h(out0, dst, 4, 0);
    __lsx_vstelm_h(out0, dst + dst_stride, 4, 1);
    __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 2);
    __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 3);
    dst += dst_stride_4x;
    __lsx_vstelm_h(out0, dst, 4, 4);
    __lsx_vstelm_h(out0, dst + dst_stride, 4, 5);
    __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 6);
    __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 7);
}

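/* Horizontal + vertical 4-tap filter on a single 8x2 block. */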
static av_always_inline
void hevc_hv_4t_8x2_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
                        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
                        const int8_t *filter_x, const int8_t *filter_y)
{
    int32_t src_stride_2x = (src_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;

    __m128i out;
    __m128i src0, src1, src2, src3, src4;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1, filter_vec;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    __m128i dst0, dst1, dst2, dst3, dst4;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
    __m128i tmp0, tmp1;
    __m128i in0, in1;

    src0_ptr -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    mask1 = __lsx_vaddi_bu(mask0, 2);

    src0 = __lsx_vld(src0_ptr, 0);
    DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
              src1, src2, src3, src4);

    DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr + src2_stride, 0, in0, in1);

    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);

    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
              filt0, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);

    DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
    DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
    DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    DUP2_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, tmp0, tmp1);
    DUP2_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp0, tmp1);
    out = __lsx_vssrlrni_bu_h(tmp1, tmp0, 7);
    __lsx_vstelm_d(out, dst, 0, 0);
    __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
}

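/* Horizontal + vertical 4-tap filter on width8mult consecutive 8-column
 * blocks, four rows high. */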
static av_always_inline
void hevc_hv_4t_8multx4_lsx(uint8_t *src0_ptr, int32_t src_stride,
                            int16_t *src1_ptr, int32_t src2_stride,
                            uint8_t *dst, int32_t dst_stride,
                            const int8_t *filter_x, const int8_t *filter_y,
                            int32_t width8mult)
{
    uint32_t cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;

    __m128i out0, out1;
    __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    __m128i in0, in1, in2, in3;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src0_ptr -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    mask1 = __lsx_vaddi_bu(mask0, 2);

    for (cnt = width8mult; cnt--;) {
        src0 = __lsx_vld(src0_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
                  src1, src2);
        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
        src0_ptr += src_stride_4x;
        src4 = __lsx_vld(src0_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
                  src5, src6);
        src0_ptr += (8 - src_stride_4x);

        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += 8;

        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
                  vec2, vec3);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
                  vec4, vec5);

        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
                  dst0, dst1);
        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);

        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);

        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
                  vec2, vec3);
        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
                  vec4, vec5);
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
                  vec6, vec7);

        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
                  vec6, filt0, dst3, dst4, dst5, dst6);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
                  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);

        DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
        DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);

        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
                  dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
                  dst2_r, dst2_l, dst3_r, dst3_l);

        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
                  dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
                  dst2_r, dst2_l, dst3_r, dst3_l);
        DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
                  dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
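        /* add the 16-bit prediction, clamp negative values to zero and
         * store the rounded, saturated (x + 64) >> 7 result */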
        DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                  tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
                  tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
        dst += 8;
    }
}

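/* Horizontal + vertical 4-tap filter on a single 8x6 block (nine source
 * rows, no loop). */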
static av_always_inline
void hevc_hv_4t_8x6_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
                        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
                        const int8_t *filter_x, const int8_t *filter_y)
{
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;

    __m128i out0, out1, out2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i in0, in1, in2, in3, in4, in5;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1, filter_vec;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
    __m128i dst76_r, dst76_l, dst87_r, dst87_l;

    src0_ptr -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    mask1 = __lsx_vaddi_bu(mask0, 2);

    src0 = __lsx_vld(src0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src1, src2);
    src3 = __lsx_vldx(src0_ptr, src_stride_3x);
    src0_ptr += src_stride_4x;
    src4 = __lsx_vld(src0_ptr, 0);
    DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
              src5, src6, src7, src8);

    in0 = __lsx_vld(src1_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
              in1, in2);
    in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
    src1_ptr += src2_stride_2x;
    in4 = __lsx_vld(src1_ptr, 0);
    in5 = __lsx_vldx(src1_ptr, src2_stride_x);

    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec10, vec11);
    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec12, vec13);
    DUP2_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, vec14, vec15);
    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);

    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
              filt0, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec10, filt0, vec12, filt0, vec14, filt0,
              vec16, filt0, dst5, dst6, dst7, dst8);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec11, filt1, dst6, vec13, filt1,
              dst7, vec15, filt1, dst8, vec17, filt1, dst5, dst6, dst7, dst8);

    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_r, dst21_r, dst32_r, dst43_r);
    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_l, dst21_l, dst32_l, dst43_l);
    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
              dst54_r, dst65_r, dst76_r, dst87_r);
    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
              dst54_l, dst65_l, dst76_l, dst87_l);

    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
              dst2_r, dst2_l, dst3_r, dst3_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
              dst4_r, dst4_l, dst5_r, dst5_l);

    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
              dst2_r, dst2_l, dst3_r, dst3_l);
    DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6,
              dst4_r, dst4_l, dst5_r, dst5_l);
    DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
              dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
    DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsadd_h, in4, tmp4, in5, tmp5, tmp4, tmp5);
    DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
    DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
    out2 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
    __lsx_vstelm_d(out0, dst, 0, 0);
    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
    __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
    __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
    dst += dst_stride_4x;
    __lsx_vstelm_d(out2, dst, 0, 0);
    __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
}

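/* Generic horizontal + vertical 4-tap filter for widths that are a
 * multiple of 8 and heights that are a multiple of 4: the outer loop
 * walks 8-column stripes, the inner loop emits four rows per iteration
 * and carries the last three horizontal results (dst10_*, dst21_*, dst2)
 * over to the next iteration. */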
static av_always_inline
void hevc_hv_4t_8multx4mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
                                int16_t *src1_ptr, int32_t src2_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter_x, const int8_t *filter_y,
                                int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src2_stride_x = (src2_stride << 1);
    const int32_t src2_stride_2x = (src2_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    const int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    __m128i out0, out1;
    __m128i src0, src1, src2, src3, src4, src5, src6;
    __m128i in0, in1, in2, in3;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1, filter_vec;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
    __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;

    src0_ptr -= (src_stride + 1);

    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);

    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    mask1 = __lsx_vaddi_bu(mask0, 2);

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        src0 = __lsx_vld(src0_ptr_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
                  src_stride_2x, src1, src2);
        src0_ptr_tmp += src_stride_3x;

        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
                  vec2, vec3);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
                  vec4, vec5);

        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
                  dst0, dst1);
        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);

        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            src3 = __lsx_vld(src0_ptr_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
                      src_stride_2x, src4, src5);
            src6 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
            src0_ptr_tmp += src_stride_4x;
            in0 = __lsx_vld(src1_ptr_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
                      src2_stride_2x, in1, in2);
            in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
            src1_ptr_tmp += src2_stride_2x;

            DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
                      src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
            DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
                      src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);

            DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
                      vec6, filt0, dst3, dst4, dst5, dst6);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
                      filt1, dst5, vec5, filt1, dst6, vec7, filt1,
                      dst3, dst4, dst5, dst6);

            DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
            DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
            DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
            DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);

            DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
                      dst21_r, filt_h0, dst21_l, filt_h0,
                      dst0_r, dst0_l, dst1_r, dst1_l);
            DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0,
                      dst43_r, filt_h0, dst43_l, filt_h0,
                      dst2_r, dst2_l, dst3_r, dst3_l);
            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
                      dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
                      dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
            DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
                      dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
                      dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);

            DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
                      dst0_r, dst0_l, dst1_r, dst1_l);
            DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
                      dst2_r, dst2_l, dst3_r, dst3_l);
            DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
                      dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
            DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                      tmp0, tmp1, tmp2, tmp3);
            DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0, tmp0,
                      tmp1, tmp2, tmp3);
            DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7,
                      out0, out1);
            __lsx_vstelm_d(out0, dst_tmp, 0, 0);
            __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
            __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
            __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
            dst_tmp += dst_stride_4x;

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}

static void hevc_hv_4t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                              int16_t *src1_ptr, int32_t src2_stride,
                              uint8_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_8x2_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                           dst, dst_stride, filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_4t_8x6_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                           dst, dst_stride, filter_x, filter_y);
    } else {
        hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter_x, filter_y,
                                   height, 8);
    }
}

static void hevc_hv_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    if (4 == height) {
        hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, 2);
    } else {
        hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter_x, filter_y,
                                   height, 16);
    }
}

static void hevc_hv_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 24);
}

static void hevc_hv_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 32);
}

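/* Macro-generated ff_hevc_put_hevc_bi_* entry points; src_16bit holds the
 * intermediate 16-bit prediction with a MAX_PB_SIZE stride. */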
#define BI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_lsx(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    hevc_bi_copy_##WIDTH##w_lsx(src, src_stride, src_16bit, MAX_PB_SIZE,  \
                                dst, dst_stride, height);                 \
}

BI_MC_COPY(4);
BI_MC_COPY(6);
BI_MC_COPY(8);
BI_MC_COPY(12);
BI_MC_COPY(16);
BI_MC_COPY(24);
BI_MC_COPY(32);
BI_MC_COPY(48);
BI_MC_COPY(64);

#undef BI_MC_COPY

#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      uint8_t *src,          \
                                                      ptrdiff_t src_stride,  \
                                                      int16_t *src_16bit,    \
                                                      int height,            \
                                                      intptr_t mx,           \
                                                      intptr_t my,           \
                                                      int width)             \
{                                                                            \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
                                                                             \
    hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit,        \
                                          MAX_PB_SIZE, dst, dst_stride,      \
                                          filter, height);                   \
}

BI_MC(qpel, h, 16, 8, hz, mx);
BI_MC(qpel, h, 24, 8, hz, mx);
BI_MC(qpel, h, 32, 8, hz, mx);
BI_MC(qpel, h, 48, 8, hz, mx);
BI_MC(qpel, h, 64, 8, hz, mx);

BI_MC(qpel, v, 8, 8, vt, my);
BI_MC(qpel, v, 16, 8, vt, my);
BI_MC(qpel, v, 24, 8, vt, my);
BI_MC(qpel, v, 32, 8, vt, my);
BI_MC(qpel, v, 48, 8, vt, my);
BI_MC(qpel, v, 64, 8, vt, my);

BI_MC(epel, h, 24, 4, hz, mx);
BI_MC(epel, h, 32, 4, hz, mx);

BI_MC(epel, v, 12, 4, vt, my);
BI_MC(epel, v, 16, 4, vt, my);
BI_MC(epel, v, 24, 4, vt, my);
BI_MC(epel, v, 32, 4, vt, my);

#undef BI_MC

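/* hv wrappers: mx selects the horizontal filter, my the vertical one. */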
#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit,           \
                                    MAX_PB_SIZE, dst, dst_stride,         \
                                    filter_x, filter_y, height);          \
}

BI_MC_HV(qpel, 8, 8);
BI_MC_HV(qpel, 16, 8);
BI_MC_HV(qpel, 24, 8);
BI_MC_HV(qpel, 32, 8);
BI_MC_HV(qpel, 48, 8);
BI_MC_HV(qpel, 64, 8);

BI_MC_HV(epel, 8, 4);
BI_MC_HV(epel, 6, 4);
BI_MC_HV(epel, 16, 4);
BI_MC_HV(epel, 24, 4);
BI_MC_HV(epel, 32, 4);

#undef BI_MC_HV