/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *                Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "hevcdsp_lsx.h"

static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

static av_always_inline
void common_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride,
                          uint8_t *dst, int32_t dst_stride,
                          const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    __m128i mask0, mask1, mask2, mask3, out1, out2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i filt0, filt1, filt2, filt3;
    __m128i res0, res1, res2, res3;

    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    src -= 3;

    /* rearranging filter */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (loop_cnt = height; loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src, 8, src, 16, src, 24,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vld, src, 32, src, 40, src, 48, src, 56,
                  src4, src5, src6, src7);
        src += src_stride;

        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
                  vec2, vec3);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
                  vec3, filt0, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
                  vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
                  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
                  vec4, vec5);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
                  vec6, vec7);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
                  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
                  vec4, vec5);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
                  vec6, vec7);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
                  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);

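        /* Shift the 16-bit sums right by 6 with rounding, saturate to
         * unsigned 8-bit and store the first 32 pixels of this output row;
         * the remaining 32 pixels are produced below. */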
        DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
                  out1, out2);
        __lsx_vst(out1, dst, 0);
        __lsx_vst(out2, dst, 16);

        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src5, mask0,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src7, src7, mask0,
                  vec2, vec3);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
                  vec3, filt0, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask2, src5, src5, mask2,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask2, src7, src7, mask2,
                  vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
                  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src5, mask1,
                  vec4, vec5);
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask1, src7, src7, mask1,
                  vec6, vec7);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
                  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask3, src5, src5, mask3,
                  vec4, vec5);
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask3, src7, src7, mask3,
                  vec6, vec7);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
                  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);

        DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
                  out1, out2);
        __lsx_vst(out1, dst, 32);
        __lsx_vst(out2, dst, 48);
        dst += dst_stride;
    }
}

static av_always_inline
void common_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride,
                         uint8_t *dst, int32_t dst_stride,
                         const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    __m128i src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    __m128i tmp0, tmp1;
    __m128i out0_r, out1_r, out2_r, out3_r;

    src -= src_stride_3x;
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
    src3 = __lsx_vldx(src, src_stride_3x);
    src += src_stride_4x;
    src4 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
    src += src_stride_3x;
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              src10_r, src32_r, src54_r, src21_r);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src7 = __lsx_vld(src, 0);
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
        src10 = __lsx_vldx(src, src_stride_3x);
        src += src_stride_4x;

        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, src76_r, src87_r, src98_r, src109_r);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
                  filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
                  src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
                  filt1, out0_r, out1_r, out2_r, out3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
                  src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
                  filt2, out0_r, out1_r, out2_r, out3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
                  src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
                  filt3, out0_r, out1_r, out2_r, out3_r);

        DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
                  tmp0, tmp1);
        __lsx_vstelm_d(tmp0, dst, 0, 0);
        __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(tmp1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(tmp1, dst + dst_stride_3x, 0, 1);
        dst += dst_stride_4x;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static av_always_inline
void common_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                          int32_t dst_stride, const int8_t *filter,
                          int32_t height, int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt0, filt1, filt2, filt3;
    __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    __m128i src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    __m128i src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= src_stride_3x;
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp, src_stride_3x);
        src_tmp += src_stride_4x;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
                  src5, src6);
        src_tmp += src_stride_3x;
        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  src10_r, src32_r, src54_r, src21_r);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  src10_l, src32_l, src54_l, src21_l);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            src7 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
                      src8, src9);
            src10 = __lsx_vldx(src_tmp, src_stride_3x);
            src_tmp += src_stride_4x;
            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                      src9, src76_r, src87_r, src98_r, src109_r);
            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
                      src9, src76_l, src87_l, src98_l, src109_l);
            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
                      filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
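            /* Accumulate the remaining three vertical taps into the low-half
             * sums; the high halves are handled by the same sequence below. */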
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
                      src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
                      filt1, out0_r, out1_r, out2_r, out3_r);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
                      src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
                      filt2, out0_r, out1_r, out2_r, out3_r);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
                      src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
                      filt3, out0_r, out1_r, out2_r, out3_r);
            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_l, filt0, src21_l, filt0, src32_l,
                      filt0, src43_l, filt0, out0_l, out1_l, out2_l, out3_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src32_l, filt1, out1_l,
                      src43_l, filt1, out2_l, src54_l, filt1, out3_l, src65_l,
                      filt1, out0_l, out1_l, out2_l, out3_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src54_l, filt2, out1_l,
                      src65_l, filt2, out2_l, src76_l, filt2, out3_l, src87_l,
                      filt2, out0_l, out1_l, out2_l, out3_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src76_l, filt3, out1_l,
                      src87_l, filt3, out2_l, src98_l, filt3, out3_l, src109_l,
                      filt3, out0_l, out1_l, out2_l, out3_l);
            DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r,
                      6, out2_l, out2_r, 6, out3_l, out3_r, 6,
                      tmp0, tmp1, tmp2, tmp3);
            __lsx_vst(tmp0, dst_tmp, 0);
            __lsx_vstx(tmp1, dst_tmp, dst_stride);
            __lsx_vstx(tmp2, dst_tmp, dst_stride_2x);
            __lsx_vstx(tmp3, dst_tmp, dst_stride_3x);
            dst_tmp += dst_stride_4x;

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 16);
    common_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 32);
}

static void common_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 48);
}

static void common_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 64);
}

static av_always_inline
void hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                        int32_t dst_stride, const int8_t *filter_x,
                        const int8_t *filter_y, int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;

    __m128i out;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i filt0, filt1, filt2, filt3;
    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
    __m128i mask1, mask2, mask3;
    __m128i filter_vec;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
    __m128i dst10_l, dst32_l, dst54_l, dst76_l;
    __m128i dst21_r, dst43_r, dst65_r, dst87_r;
    __m128i dst21_l, dst43_l, dst65_l, dst87_l;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);

    src -= (src_stride_3x + 3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
              filter_x, 6, filt0, filt1, filt2, filt3);

    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp, src_stride_3x);
        src_tmp += src_stride_4x;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
                  src5, src6);
        src_tmp += src_stride_3x;

        /* row 0 row 1 row 2 row 3 */
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
                  vec12, filt0, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
                  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
                  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
                  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);

        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
                  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
                  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
        DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
                  dst1, dst10_r, dst32_r, dst54_r, dst21_r);
        DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
                  dst1, dst10_l, dst32_l, dst54_l, dst21_l);
        DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            src7 = __lsx_vld(src_tmp, 0);
            src8 = __lsx_vldx(src_tmp, src_stride);
            src_tmp += src_stride_2x;

            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
                      filt2, dst7, dst7);
            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
            dst76_r = __lsx_vilvl_h(dst7, dst6);
            dst76_l = __lsx_vilvh_h(dst7, dst6);
            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
                      dst0_r, dst0_l);
            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
                      dst76_l, filt_h3, dst0_r, dst0_l);
            DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);

            DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
                      src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
            dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
                      filt2, dst8, dst8);
            dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);

            dst87_r = __lsx_vilvl_h(dst8, dst7);
            dst87_l = __lsx_vilvh_h(dst8, dst7);
            DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
                      dst1_r, dst1_l);
            DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
                      dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
                      dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
            DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
                      dst87_l, filt_h3, dst1_r, dst1_l);
            DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
            DUP4_ARG2(__lsx_vsrari_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l,
                      6, dst0_r, dst0_l, dst1_r, dst1_l);
            DUP4_ARG1(__lsx_vclip255_w, dst0_l, dst0_r, dst1_l, dst1_r,
                      dst0_l, dst0_r, dst1_l, dst1_r);
            DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
                      dst0, dst1);
            out = __lsx_vpickev_b(dst1, dst0);
            __lsx_vstelm_d(out, dst_tmp, 0, 0);
            __lsx_vstelm_d(out, dst_tmp + dst_stride, 0, 1);
            dst_tmp += dst_stride_2x;

            dst10_r = dst32_r;
            dst32_r = dst54_r;
            dst54_r = dst76_r;
            dst10_l = dst32_l;
            dst32_l = dst54_l;
            dst54_l = dst76_l;
            dst21_r = dst43_r;
            dst43_r = dst65_r;
            dst65_r = dst87_r;
            dst21_l = dst43_l;
            dst43_l = dst65_l;
            dst65_l = dst87_l;
            dst6 = dst8;
        }
        src += 8;
        dst += 8;
    }
}

static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                              int32_t dst_stride, const int8_t *filter_x,
                              const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 8);
}

static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 16);
}

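/* The wider hv qpel cases below all reuse hevc_hv_8t_8x2_lsx, which walks the
 * block in 8-column stripes and emits two output rows per inner iteration. */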
static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 24);
}

static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 32);
}

static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 48);
}

static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 64);
}

static av_always_inline
void common_vt_4t_24w_lsx(uint8_t *src, int32_t src_stride,
                          uint8_t *dst, int32_t dst_stride,
                          const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    uint8_t *_src;

    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i src11, filt0, filt1;
    __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    __m128i src109_r, src10_l, src32_l, src21_l, src43_l;
    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
    __m128i out1, out2, out3, out4;

    src -= src_stride;
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    _src = src + 16;

    /* 16 width */
    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);

    /* 8 width */
    src6 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
    src += src_stride_3x;
    _src += src_stride_3x;
    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = 8; loop_cnt--;) {
        /* 16 width */
        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);

        /* 8 width */
        src += src_stride_2x;
        _src += src_stride_2x;
        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);

        /* 16 width */
        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
                  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
                  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
                  out0_r, out0_l, out1_r, out1_l);

        /* 8 width */
        DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
                  out2_r, out3_r);
        DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out3_r,
                  src109_r, filt1, out2_r, out3_r);

        /* 16 + 8 width */
        DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
                  out3_r, out3_r, 6, out1_l, out1_r, 6, out1, out2, out3, out4);
        __lsx_vst(out1, dst, 0);
        __lsx_vstelm_d(out2, dst, 16, 0);
        dst += dst_stride;
        __lsx_vst(out4, dst, 0);
        __lsx_vstelm_d(out3, dst, 16, 0);
        dst += dst_stride;

        /* 16 width */
        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);

        /* 8 width */
        src += src_stride_2x;
        _src += src_stride_2x;
        DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);

        /* 16 width */
        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
                  filt0, src43_l, filt0, out0_r, out0_l, out1_r, out1_l);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src10_r, filt1, out0_l, src10_l,
                  filt1, out1_r, src21_r, filt1, out1_l, src21_l, filt1,
                  out0_r, out0_l, out1_r, out1_l);

        /* 8 width */
        DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
                  out2_r, out3_r);
        DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src76_r, filt1, out3_r,
                  src87_r, filt1, out2_r, out3_r);

        /* 16 + 8 width */
        DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
                  out1_l, out1_r, 6, out3_r, out3_r, 6, out1, out2, out3, out4);

        __lsx_vst(out1, dst, 0);
        __lsx_vstelm_d(out2, dst, 16, 0);
        dst += dst_stride;
        __lsx_vst(out3, dst, 0);
        __lsx_vstelm_d(out4, dst, 16, 0);
        dst += dst_stride;
    }
}

static av_always_inline
void common_vt_4t_32w_lsx(uint8_t *src, int32_t src_stride,
                          uint8_t *dst, int32_t dst_stride,
                          const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    uint8_t *_src;

    __m128i src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    __m128i src10_r, src32_r, src76_r, src98_r;
    __m128i src21_r, src43_r, src87_r, src109_r;
    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    __m128i src10_l, src32_l, src76_l, src98_l;
    __m128i src21_l, src43_l, src87_l, src109_l;
    __m128i filt0, filt1;
    __m128i out1, out2;

    src -= src_stride;
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    _src = src + 16;

    /* 16 width */
    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);

    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);

    /* next 16 width */
    src6 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
    src += src_stride_3x;
    _src += src_stride_3x;

    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* 16 width */
        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);

        /* 16 width */
        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
                  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
                  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
                  out0_r, out0_l, out1_r, out1_l);

        DUP2_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 6,
                  out1, out2);
        __lsx_vst(out1, dst, 0);
        __lsx_vstx(out2, dst, dst_stride);

        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        /* next 16 width */
        src += src_stride_2x;
        _src += src_stride_2x;
        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
        DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);

        /* next 16 width */
        DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
                  filt0, src87_l, filt0, out2_r, out2_l, out3_r, out3_l);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out2_l, src98_l,
                  filt1, out3_r, src109_r, filt1, out3_l, src109_l, filt1,
                  out2_r, out2_l, out3_r, out3_l);

        /* next 16 width */
        DUP2_ARG3(__lsx_vssrarni_bu_h, out2_l, out2_r, 6, out3_l, out3_r, 6,
                  out1, out2);
        __lsx_vst(out1, dst, 16);
        __lsx_vst(out2, dst + dst_stride, 16);

        dst += dst_stride_2x;

        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}

static av_always_inline
void hevc_hv_4t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                        int32_t dst_stride, const int8_t *filter_x,
                        const int8_t *filter_y)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    __m128i out;
    __m128i src0, src1, src2, src3, src4;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1, filter_vec;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    __m128i dst0, dst1, dst2, dst3, dst4;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
    __m128i out0_r, out1_r;

    src -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    mask1 = __lsx_vaddi_bu(mask0, 2);
    src0 = __lsx_vld(src, 0);
    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
              src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);

    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
              mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
    DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
              mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);

    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
              filt0, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_r, dst21_r, dst32_r, dst43_r);
    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_l, dst21_l, dst32_l, dst43_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
              out0_r, out1_r);
    out = __lsx_vssrarni_bu_h(out1_r, out0_r, 6);
    __lsx_vstelm_d(out, dst, 0, 0);
    __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
}

static av_always_inline
void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                            int32_t dst_stride, const int8_t *filter_x,
                            const int8_t *filter_y, int32_t width8mult)
{
    uint32_t cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i out0, out1;
    __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    mask1 = __lsx_vaddi_bu(mask0, 2);

    for (cnt = width8mult; cnt--;) {
        src0 = __lsx_vld(src, 0);
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
        src3 = __lsx_vldx(src, src_stride_3x);
        src += src_stride_4x;
        src4 = __lsx_vld(src, 0);
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
        src += (8 - src_stride_4x);
        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
                  vec2, vec3);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
                  vec4, vec5);

        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
                  dst0, dst1);
        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);

        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);

        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
                  vec2, vec3);
        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
                  vec4, vec5);
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
                  vec6, vec7);

        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
                  vec6, filt0, dst3, dst4, dst5, dst6);
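        /* Second horizontal tap for rows 3..6; rows 0..2 were filtered above
         * before entering the vertical stage. */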
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
                  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);

        DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
                  dst5, dst32_r, dst43_r, dst54_r, dst65_r);
        DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
                  dst5, dst32_l, dst43_l, dst54_l, dst65_l);

        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
                  dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
                  dst2_r, dst2_l, dst3_r, dst3_l);

        DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
                  dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
        dst += 8;
    }
}

static av_always_inline
void hevc_hv_4t_8x6_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                        int32_t dst_stride, const int8_t *filter_x,
                        const int8_t *filter_y)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i out0, out1, out2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1, filter_vec;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
    __m128i dst76_r, dst76_l, dst87_r, dst87_l;
    __m128i out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;

    src -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    mask1 = __lsx_vaddi_bu(mask0, 2);

    src0 = __lsx_vld(src, 0);
    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
              src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
    src += src_stride_4x;
    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
              src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);

    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
              mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
    DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
              mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
              mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
    DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
              mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);

    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
              filt0, dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, vec14,
              filt0, dst4, dst5, dst6, dst7);
    dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
              vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
    dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);

    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_r, dst21_r, dst32_r, dst43_r);
    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_l, dst21_l, dst32_l, dst43_l);
    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
              dst54_r, dst65_r, dst76_r, dst87_r);
    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
              dst54_l, dst65_l, dst76_l, dst87_l);

    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
              dst2_r, dst2_l, dst3_r, dst3_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
              dst4_r, dst4_l, dst5_r, dst5_l);

    DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
              dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, out2_r, out3_r);
    DUP2_ARG3(__lsx_vsrani_h_w, dst4_l, dst4_r, 6, dst5_l, dst5_r, 6,
              out4_r, out5_r);
    DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
              out0, out1);
    out2 = __lsx_vssrarni_bu_h(out5_r, out4_r, 6);

    __lsx_vstelm_d(out0, dst, 0, 0);
    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
    __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
    __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
    dst += dst_stride_4x;
    __lsx_vstelm_d(out2, dst, 0, 0);
    __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
}

static av_always_inline
void hevc_hv_4t_8multx4mult_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                                int32_t dst_stride, const int8_t *filter_x,
                                const int8_t *filter_y, int32_t height,
                                int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i out0, out1;
    __m128i src0, src1, src2, src3, src4, src5, src6;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1, filter_vec;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
    __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
    __m128i out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
    mask1 = __lsx_vaddi_bu(mask0, 2);

    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
                  src1, src2);
        src_tmp += src_stride_3x;

        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
                  vec2, vec3);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
                  vec4, vec5);

        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
                  dst0, dst1);
        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);

        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            src3 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
                      src4, src5);
            src6 = __lsx_vldx(src_tmp, src_stride_3x);
            src_tmp += src_stride_4x;

            DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
                      src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
            DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
                      src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);

            DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
                      vec6, filt0, dst3, dst4, dst5, dst6);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
                      filt1, dst5, vec5, filt1, dst6, vec7, filt1,
                      dst3, dst4, dst5, dst6);

            DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4,
                      dst6, dst5, dst32_r, dst43_r, dst54_r, dst65_r);
            DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4,
                      dst6, dst5, dst32_l, dst43_l, dst54_l, dst65_l);

            DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
                      filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
            DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
                      filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
                      dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
                      dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
            DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
                      dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
                      dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);

            DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
                      dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r,
                      out2_r, out3_r);
            DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r,
                      6, out0, out1);
            __lsx_vstelm_d(out0, dst_tmp, 0, 0);
            __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
            __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
            __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
            dst_tmp += dst_stride_4x;

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }
        src += 8;
        dst += 8;
    }
}

static
void hevc_hv_4t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                       int32_t dst_stride, const int8_t *filter_x,
                       const int8_t *filter_y, int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
    } else if (0 == (height & 0x03)) {
        hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 1);
    }
}

static av_always_inline
void hevc_hv_4t_12w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                        int32_t dst_stride, const int8_t *filter_x,
                        const int8_t *filter_y, int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i out0, out1;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i mask0, mask1, mask2, mask3;
    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    __m128i dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    __m128i dst76_r, dst98_r, dst87_r, dst109_r;
    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);

    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);

    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    mask1 = __lsx_vaddi_bu(mask0, 2);

    src_tmp = src;
    dst_tmp = dst;

    src0 = __lsx_vld(src_tmp, 0);
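    /* src1/src2 below complete the three rows needed to prime the 4-tap
     * vertical filter for the left 8 columns of the 12-wide block. */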
    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
              src1, src2);
    src_tmp += src_stride_3x;

    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);

    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
    dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
              dsth0, dsth1);
    dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);

    DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r);
    DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l);

    for (loop_cnt = 4; loop_cnt--;) {
        src3 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
                  src4, src5);
        src6 = __lsx_vldx(src_tmp, src_stride_3x);
        src_tmp += src_stride_4x;

        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
                  src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
                  src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);

        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
                  vec6, filt0, dsth3, dsth4, dsth5, dsth6);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4,
                  vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1,
                  dsth3, dsth4, dsth5, dsth6);

        DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
                  dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r);
        DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
                  dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l);

        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
                  dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
                  dst2_r, dst2_l, dst3_r, dst3_l);

        DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
                  dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);

        __lsx_vstelm_d(out0, dst_tmp, 0, 0);
        __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
        __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
        dst_tmp += dst_stride_4x;

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    src += 8;
    dst += 8;

    mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
    mask3 = __lsx_vaddi_bu(mask2, 2);

    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
    src += src_stride_3x;
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);

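    /* For the remaining 4-wide column two rows are packed per vector, so the
     * single dp2 below covers row pairs 0/1 and 1/2 at once. */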
    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
              dst10, dst21);

    dst10_r = __lsx_vilvl_h(dst21, dst10);
    dst21_r = __lsx_vilvh_h(dst21, dst10);
    dst22 = __lsx_vreplvei_d(dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        src3 = __lsx_vld(src, 0);
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
        src6 = __lsx_vldx(src, src_stride_3x);
        src += src_stride_4x;
        src7 = __lsx_vld(src, 0);
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
        src10 = __lsx_vldx(src, src_stride_3x);
        src += src_stride_4x;
        DUP4_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3, src8,
                  src4, mask2, src8, src4, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3, src10,
                  src6, mask2, src10, src6, mask3, vec4, vec5, vec6, vec7);

        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
                  vec6, filt0, dst73, dst84, dst95, dst106);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
                  filt1, dst95, vec5, filt1, dst106, vec7, filt1,
                  dst73, dst84, dst95, dst106);

        dst32_r = __lsx_vilvl_h(dst73, dst22);
        DUP2_ARG2(__lsx_vilvl_h, dst84, dst73, dst95, dst84, dst43_r, dst54_r);
        DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
        dst65_r = __lsx_vilvl_h(dst106, dst95);
        dst109_r = __lsx_vilvh_h(dst106, dst95);
        dst22 = __lsx_vreplvei_d(dst73, 1);
        dst76_r = __lsx_vilvl_h(dst22, dst106);

        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
                  filt_h0, dst43_r, filt_h0, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
                  filt_h0, dst87_r, filt_h0, dst4, dst5, dst6, dst7);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst0, dst32_r, filt_h1, dst1, dst43_r,
                  filt_h1, dst2, dst54_r, filt_h1, dst3, dst65_r, filt_h1,
                  dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst4, dst76_r, filt_h1, dst5, dst87_r,
                  filt_h1, dst6, dst98_r, filt_h1, dst7, dst109_r, filt_h1,
                  dst4, dst5, dst6, dst7);

        DUP4_ARG3(__lsx_vsrani_h_w, dst1, dst0, 6, dst3, dst2, 6, dst5, dst4,
                  6, dst7, dst6, 6, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);

        __lsx_vstelm_w(out0, dst, 0, 0);
        __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
        __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
        __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
        dst += dst_stride_4x;
        __lsx_vstelm_w(out1, dst, 0, 0);
        __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
        __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
        __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
        dst += dst_stride_4x;

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = __lsx_vreplvei_d(dst106, 1);
    }
}

static void hevc_hv_4t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    if (4 == height) {
        hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride, filter_x,
                               filter_y, 2);
    } else {
        hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 2);
    }
}

static void hevc_hv_4t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 3);
}

static void hevc_hv_4t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 4);
}

#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,          \
                                                       ptrdiff_t dst_stride,  \
                                                       uint8_t *src,          \
                                                       ptrdiff_t src_stride,  \
                                                       int height,            \
                                                       intptr_t mx,           \
                                                       intptr_t my,           \
                                                       int width)             \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
                                                                              \
    common_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
                                            filter, height);                  \
}

UNI_MC(qpel, h, 64, 8, hz, mx);

UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);

UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);

#undef UNI_MC

#define UNI_MC_HV(PEL, WIDTH, TAP)                                        \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst,         \
                                                    ptrdiff_t dst_stride, \
                                                    uint8_t *src,         \
                                                    ptrdiff_t src_stride, \
                                                    int height,           \
                                                    intptr_t mx,          \
                                                    intptr_t my,          \
                                                    int width)            \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride,     \
                                    filter_x, filter_y, height);          \
}

UNI_MC_HV(qpel, 8, 8);
UNI_MC_HV(qpel, 16, 8);
UNI_MC_HV(qpel, 24, 8);
UNI_MC_HV(qpel, 32, 8);
UNI_MC_HV(qpel, 48, 8);
UNI_MC_HV(qpel, 64, 8);

UNI_MC_HV(epel, 8, 4);
UNI_MC_HV(epel, 12, 4);
UNI_MC_HV(epel, 16, 4);
UNI_MC_HV(epel, 24, 4);
UNI_MC_HV(epel, 32, 4);

#undef UNI_MC_HV