1/* 2 * Loongson LASX optimized h264qpel 3 * 4 * Copyright (c) 2020 Loongson Technology Corporation Limited 5 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 6 * 7 * This file is part of FFmpeg. 8 * 9 * FFmpeg is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU Lesser General Public 11 * License as published by the Free Software Foundation; either 12 * version 2.1 of the License, or (at your option) any later version. 13 * 14 * FFmpeg is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * Lesser General Public License for more details. 18 * 19 * You should have received a copy of the GNU Lesser General Public 20 * License along with FFmpeg; if not, write to the Free Software 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22 */ 23 24#include "h264qpel_lasx.h" 25#include "libavutil/loongarch/loongson_intrinsics.h" 26#include "libavutil/attributes.h" 27 28static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = { 29 /* 8 width cases */ 30 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, 31 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, 32 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 33 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 34 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 35 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 36}; 37 38#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \ 39( { \ 40 __m256i out0_m; \ 41 __m256i tmp0_m; \ 42 \ 43 tmp0_m = __lasx_xvshuf_b(in1, in0, mask0); \ 44 out0_m = __lasx_xvhaddw_h_b(tmp0_m, tmp0_m); \ 45 tmp0_m = __lasx_xvshuf_b(in1, in0, mask1); \ 46 out0_m = __lasx_xvdp2add_h_b(out0_m, minus5b, tmp0_m); \ 47 tmp0_m = __lasx_xvshuf_b(in1, in0, mask2); \ 48 out0_m = __lasx_xvdp2add_h_b(out0_m, plus20b, tmp0_m); \ 49 \ 50 out0_m; \ 51} ) 52 53#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \ 54( { \ 55 __m256i out0_m; \ 56 \ 57 out0_m = __lasx_xvdp2_h_b(in0, coeff0); \ 58 DUP2_ARG3(__lasx_xvdp2add_h_b, out0_m, in1, coeff1, out0_m,\ 59 in2, coeff2, out0_m, out0_m); \ 60 \ 61 out0_m; \ 62} ) 63 64static av_always_inline 65void avc_luma_hv_qrt_and_aver_dst_16x16_lasx(uint8_t *src_x, 66 uint8_t *src_y, 67 uint8_t *dst, ptrdiff_t stride) 68{ 69 const int16_t filt_const0 = 0xfb01; 70 const int16_t filt_const1 = 0x1414; 71 const int16_t filt_const2 = 0x1fb; 72 uint32_t loop_cnt; 73 ptrdiff_t stride_2x = stride << 1; 74 ptrdiff_t stride_3x = stride_2x + stride; 75 ptrdiff_t stride_4x = stride << 2; 76 __m256i tmp0, tmp1; 77 __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2; 78 __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 79 __m256i src_vt7, src_vt8; 80 __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h; 81 __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2; 82 __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 83 __m256i vt_out3, out0, out1, out2, out3; 84 __m256i minus5b = __lasx_xvldi(0xFB); 85 __m256i plus20b = __lasx_xvldi(20); 86 87 filt0 = __lasx_xvreplgr2vr_h(filt_const0); 88 filt1 = __lasx_xvreplgr2vr_h(filt_const1); 89 filt2 = __lasx_xvreplgr2vr_h(filt_const2); 90 91 mask0 = __lasx_xvld(luma_mask_arr, 0); 92 DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2); 93 src_vt0 = __lasx_xvld(src_y, 0); 94 DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x, 95 src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4); 96 src_y += stride_4x; 97 98 src_vt0 = __lasx_xvxori_b(src_vt0, 128); 99 DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128, 100 src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4); 101 102 for (loop_cnt = 4; loop_cnt--;) { 103 src_hz0 = __lasx_xvld(src_x, 0); 104 DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x, 105 src_hz1, src_hz2); 106 src_hz3 = __lasx_xvldx(src_x, stride_3x); 107 src_x += stride_4x; 108 src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94); 109 src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94); 110 src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94); 111 src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94); 112 DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128, 113 src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3); 114 115 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 116 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 117 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 118 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 119 hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5); 120 hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5); 121 122 DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, 123 src_y, stride_3x, src_y, stride_4x, 124 src_vt5, src_vt6, src_vt7, src_vt8); 125 src_y += stride_4x; 126 127 DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128, 128 src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8); 129 130 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5, 131 0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02, 132 src_vt0, src_vt1, src_vt2, src_vt3); 133 src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02); 134 DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1, 135 src_vt3, src_vt2, src_vt87_h, src_vt3, 136 src_hz0, src_hz1, src_hz2, src_hz3); 137 DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1, 138 src_vt3, src_vt2, src_vt87_h, src_vt3, 139 src_vt0, src_vt1, src_vt2, src_vt3); 140 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, src_hz1, 141 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 0x02, 142 src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h); 143 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, src_hz1, 144 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 0x13, 145 src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h); 146 vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, filt0, 147 filt1, filt2); 148 vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, filt0, 149 filt1, filt2); 150 vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, filt0, 151 filt1, filt2); 152 vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, filt0, 153 filt1, filt2); 154 vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5); 155 vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5); 156 157 DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2, 158 out0, out2); 159 DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2, 160 out1, out3); 161 tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1); 162 tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1); 163 164 DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 165 out0 = __lasx_xvld(dst, 0); 166 DUP2_ARG2(__lasx_xvldx, dst, stride, dst, stride_2x, out1, out2); 167 out3 = __lasx_xvldx(dst, stride_3x); 168 out0 = __lasx_xvpermi_q(out0, out2, 0x02); 169 out1 = __lasx_xvpermi_q(out1, out3, 0x02); 170 out2 = __lasx_xvilvl_d(out1, out0); 171 out3 = __lasx_xvilvh_d(out1, out0); 172 out0 = __lasx_xvpermi_q(out2, out3, 0x02); 173 out1 = __lasx_xvpermi_q(out2, out3, 0x13); 174 tmp0 = __lasx_xvavgr_bu(out0, tmp0); 175 tmp1 = __lasx_xvavgr_bu(out1, tmp1); 176 177 __lasx_xvstelm_d(tmp0, dst, 0, 0); 178 __lasx_xvstelm_d(tmp0, dst + stride, 0, 1); 179 __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0); 180 __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1); 181 182 __lasx_xvstelm_d(tmp0, dst, 8, 2); 183 __lasx_xvstelm_d(tmp0, dst + stride, 8, 3); 184 __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2); 185 __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3); 186 187 dst += stride_4x; 188 src_vt0 = src_vt4; 189 src_vt1 = src_vt5; 190 src_vt2 = src_vt6; 191 src_vt3 = src_vt7; 192 src_vt4 = src_vt8; 193 } 194} 195 196static av_always_inline void 197avc_luma_hv_qrt_16x16_lasx(uint8_t *src_x, uint8_t *src_y, 198 uint8_t *dst, ptrdiff_t stride) 199{ 200 const int16_t filt_const0 = 0xfb01; 201 const int16_t filt_const1 = 0x1414; 202 const int16_t filt_const2 = 0x1fb; 203 uint32_t loop_cnt; 204 ptrdiff_t stride_2x = stride << 1; 205 ptrdiff_t stride_3x = stride_2x + stride; 206 ptrdiff_t stride_4x = stride << 2; 207 __m256i tmp0, tmp1; 208 __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2; 209 __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 210 __m256i src_vt7, src_vt8; 211 __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h; 212 __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2; 213 __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 214 __m256i vt_out3, out0, out1, out2, out3; 215 __m256i minus5b = __lasx_xvldi(0xFB); 216 __m256i plus20b = __lasx_xvldi(20); 217 218 filt0 = __lasx_xvreplgr2vr_h(filt_const0); 219 filt1 = __lasx_xvreplgr2vr_h(filt_const1); 220 filt2 = __lasx_xvreplgr2vr_h(filt_const2); 221 222 mask0 = __lasx_xvld(luma_mask_arr, 0); 223 DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2); 224 src_vt0 = __lasx_xvld(src_y, 0); 225 DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x, 226 src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4); 227 src_y += stride_4x; 228 229 src_vt0 = __lasx_xvxori_b(src_vt0, 128); 230 DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128, 231 src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4); 232 233 for (loop_cnt = 4; loop_cnt--;) { 234 src_hz0 = __lasx_xvld(src_x, 0); 235 DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x, 236 src_hz1, src_hz2); 237 src_hz3 = __lasx_xvldx(src_x, stride_3x); 238 src_x += stride_4x; 239 src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94); 240 src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94); 241 src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94); 242 src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94); 243 DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128, 244 src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3); 245 246 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 247 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 248 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 249 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 250 hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5); 251 hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5); 252 253 DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, 254 src_y, stride_3x, src_y, stride_4x, 255 src_vt5, src_vt6, src_vt7, src_vt8); 256 src_y += stride_4x; 257 258 DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128, 259 src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8); 260 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5, 261 0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02, 262 src_vt0, src_vt1, src_vt2, src_vt3); 263 src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02); 264 DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1, 265 src_vt3, src_vt2, src_vt87_h, src_vt3, 266 src_hz0, src_hz1, src_hz2, src_hz3); 267 DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1, 268 src_vt3, src_vt2, src_vt87_h, src_vt3, 269 src_vt0, src_vt1, src_vt2, src_vt3); 270 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, 271 src_hz1, 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 272 0x02, src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h); 273 DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, 274 src_hz1, 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 275 0x13, src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h); 276 277 vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, 278 filt0, filt1, filt2); 279 vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, 280 filt0, filt1, filt2); 281 vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, 282 filt0, filt1, filt2); 283 vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, 284 filt0, filt1, filt2); 285 vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5); 286 vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5); 287 288 DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2, 289 out0, out2); 290 DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2, 291 out1, out3); 292 tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1); 293 tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1); 294 295 DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 296 __lasx_xvstelm_d(tmp0, dst, 0, 0); 297 __lasx_xvstelm_d(tmp0, dst + stride, 0, 1); 298 __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0); 299 __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1); 300 301 __lasx_xvstelm_d(tmp0, dst, 8, 2); 302 __lasx_xvstelm_d(tmp0, dst + stride, 8, 3); 303 __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2); 304 __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3); 305 306 dst += stride_4x; 307 src_vt0 = src_vt4; 308 src_vt1 = src_vt5; 309 src_vt2 = src_vt6; 310 src_vt3 = src_vt7; 311 src_vt4 = src_vt8; 312 } 313} 314 315/* put_pixels8_8_inline_asm: dst = src */ 316static av_always_inline void 317put_pixels8_8_inline_asm(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) 318{ 319 uint64_t tmp[8]; 320 ptrdiff_t stride_2, stride_3, stride_4; 321 __asm__ volatile ( 322 "slli.d %[stride_2], %[stride], 1 \n\t" 323 "add.d %[stride_3], %[stride_2], %[stride] \n\t" 324 "slli.d %[stride_4], %[stride_2], 1 \n\t" 325 "ld.d %[tmp0], %[src], 0x0 \n\t" 326 "ldx.d %[tmp1], %[src], %[stride] \n\t" 327 "ldx.d %[tmp2], %[src], %[stride_2] \n\t" 328 "ldx.d %[tmp3], %[src], %[stride_3] \n\t" 329 "add.d %[src], %[src], %[stride_4] \n\t" 330 "ld.d %[tmp4], %[src], 0x0 \n\t" 331 "ldx.d %[tmp5], %[src], %[stride] \n\t" 332 "ldx.d %[tmp6], %[src], %[stride_2] \n\t" 333 "ldx.d %[tmp7], %[src], %[stride_3] \n\t" 334 335 "st.d %[tmp0], %[dst], 0x0 \n\t" 336 "stx.d %[tmp1], %[dst], %[stride] \n\t" 337 "stx.d %[tmp2], %[dst], %[stride_2] \n\t" 338 "stx.d %[tmp3], %[dst], %[stride_3] \n\t" 339 "add.d %[dst], %[dst], %[stride_4] \n\t" 340 "st.d %[tmp4], %[dst], 0x0 \n\t" 341 "stx.d %[tmp5], %[dst], %[stride] \n\t" 342 "stx.d %[tmp6], %[dst], %[stride_2] \n\t" 343 "stx.d %[tmp7], %[dst], %[stride_3] \n\t" 344 : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), 345 [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]), 346 [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]), 347 [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]), 348 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 349 [stride_4]"=&r"(stride_4), 350 [dst]"+&r"(dst), [src]"+&r"(src) 351 : [stride]"r"(stride) 352 : "memory" 353 ); 354} 355 356/* avg_pixels8_8_lsx : dst = avg(src, dst) 357 * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8. 358 * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 359static av_always_inline void 360avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) 361{ 362 uint8_t *tmp = dst; 363 ptrdiff_t stride_2, stride_3, stride_4; 364 __asm__ volatile ( 365 /* h0~h7 */ 366 "slli.d %[stride_2], %[stride], 1 \n\t" 367 "add.d %[stride_3], %[stride_2], %[stride] \n\t" 368 "slli.d %[stride_4], %[stride_2], 1 \n\t" 369 "vld $vr0, %[src], 0 \n\t" 370 "vldx $vr1, %[src], %[stride] \n\t" 371 "vldx $vr2, %[src], %[stride_2] \n\t" 372 "vldx $vr3, %[src], %[stride_3] \n\t" 373 "add.d %[src], %[src], %[stride_4] \n\t" 374 "vld $vr4, %[src], 0 \n\t" 375 "vldx $vr5, %[src], %[stride] \n\t" 376 "vldx $vr6, %[src], %[stride_2] \n\t" 377 "vldx $vr7, %[src], %[stride_3] \n\t" 378 379 "vld $vr8, %[tmp], 0 \n\t" 380 "vldx $vr9, %[tmp], %[stride] \n\t" 381 "vldx $vr10, %[tmp], %[stride_2] \n\t" 382 "vldx $vr11, %[tmp], %[stride_3] \n\t" 383 "add.d %[tmp], %[tmp], %[stride_4] \n\t" 384 "vld $vr12, %[tmp], 0 \n\t" 385 "vldx $vr13, %[tmp], %[stride] \n\t" 386 "vldx $vr14, %[tmp], %[stride_2] \n\t" 387 "vldx $vr15, %[tmp], %[stride_3] \n\t" 388 389 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 390 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 391 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 392 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 393 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 394 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 395 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 396 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 397 398 "vstelm.d $vr0, %[dst], 0, 0 \n\t" 399 "add.d %[dst], %[dst], %[stride] \n\t" 400 "vstelm.d $vr1, %[dst], 0, 0 \n\t" 401 "add.d %[dst], %[dst], %[stride] \n\t" 402 "vstelm.d $vr2, %[dst], 0, 0 \n\t" 403 "add.d %[dst], %[dst], %[stride] \n\t" 404 "vstelm.d $vr3, %[dst], 0, 0 \n\t" 405 "add.d %[dst], %[dst], %[stride] \n\t" 406 "vstelm.d $vr4, %[dst], 0, 0 \n\t" 407 "add.d %[dst], %[dst], %[stride] \n\t" 408 "vstelm.d $vr5, %[dst], 0, 0 \n\t" 409 "add.d %[dst], %[dst], %[stride] \n\t" 410 "vstelm.d $vr6, %[dst], 0, 0 \n\t" 411 "add.d %[dst], %[dst], %[stride] \n\t" 412 "vstelm.d $vr7, %[dst], 0, 0 \n\t" 413 : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src), 414 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 415 [stride_4]"=&r"(stride_4) 416 : [stride]"r"(stride) 417 : "memory" 418 ); 419} 420 421/* avg_pixels8_8_lsx : dst = avg(src, dst) 422 * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8. 423 * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 424static av_always_inline void 425put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half, 426 ptrdiff_t dstStride, ptrdiff_t srcStride) 427{ 428 ptrdiff_t stride_2, stride_3, stride_4; 429 __asm__ volatile ( 430 /* h0~h7 */ 431 "slli.d %[stride_2], %[srcStride], 1 \n\t" 432 "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" 433 "slli.d %[stride_4], %[stride_2], 1 \n\t" 434 "vld $vr0, %[src], 0 \n\t" 435 "vldx $vr1, %[src], %[srcStride] \n\t" 436 "vldx $vr2, %[src], %[stride_2] \n\t" 437 "vldx $vr3, %[src], %[stride_3] \n\t" 438 "add.d %[src], %[src], %[stride_4] \n\t" 439 "vld $vr4, %[src], 0 \n\t" 440 "vldx $vr5, %[src], %[srcStride] \n\t" 441 "vldx $vr6, %[src], %[stride_2] \n\t" 442 "vldx $vr7, %[src], %[stride_3] \n\t" 443 444 "vld $vr8, %[half], 0x00 \n\t" 445 "vld $vr9, %[half], 0x08 \n\t" 446 "vld $vr10, %[half], 0x10 \n\t" 447 "vld $vr11, %[half], 0x18 \n\t" 448 "vld $vr12, %[half], 0x20 \n\t" 449 "vld $vr13, %[half], 0x28 \n\t" 450 "vld $vr14, %[half], 0x30 \n\t" 451 "vld $vr15, %[half], 0x38 \n\t" 452 453 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 454 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 455 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 456 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 457 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 458 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 459 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 460 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 461 462 "vstelm.d $vr0, %[dst], 0, 0 \n\t" 463 "add.d %[dst], %[dst], %[dstStride] \n\t" 464 "vstelm.d $vr1, %[dst], 0, 0 \n\t" 465 "add.d %[dst], %[dst], %[dstStride] \n\t" 466 "vstelm.d $vr2, %[dst], 0, 0 \n\t" 467 "add.d %[dst], %[dst], %[dstStride] \n\t" 468 "vstelm.d $vr3, %[dst], 0, 0 \n\t" 469 "add.d %[dst], %[dst], %[dstStride] \n\t" 470 "vstelm.d $vr4, %[dst], 0, 0 \n\t" 471 "add.d %[dst], %[dst], %[dstStride] \n\t" 472 "vstelm.d $vr5, %[dst], 0, 0 \n\t" 473 "add.d %[dst], %[dst], %[dstStride] \n\t" 474 "vstelm.d $vr6, %[dst], 0, 0 \n\t" 475 "add.d %[dst], %[dst], %[dstStride] \n\t" 476 "vstelm.d $vr7, %[dst], 0, 0 \n\t" 477 : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src), 478 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 479 [stride_4]"=&r"(stride_4) 480 : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride) 481 : "memory" 482 ); 483} 484 485/* avg_pixels8_8_lsx : dst = avg(src, dst) 486 * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8. 487 * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 488static av_always_inline void 489avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half, 490 ptrdiff_t dstStride, ptrdiff_t srcStride) 491{ 492 uint8_t *tmp = dst; 493 ptrdiff_t stride_2, stride_3, stride_4; 494 __asm__ volatile ( 495 /* h0~h7 */ 496 "slli.d %[stride_2], %[srcStride], 1 \n\t" 497 "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" 498 "slli.d %[stride_4], %[stride_2], 1 \n\t" 499 "vld $vr0, %[src], 0 \n\t" 500 "vldx $vr1, %[src], %[srcStride] \n\t" 501 "vldx $vr2, %[src], %[stride_2] \n\t" 502 "vldx $vr3, %[src], %[stride_3] \n\t" 503 "add.d %[src], %[src], %[stride_4] \n\t" 504 "vld $vr4, %[src], 0 \n\t" 505 "vldx $vr5, %[src], %[srcStride] \n\t" 506 "vldx $vr6, %[src], %[stride_2] \n\t" 507 "vldx $vr7, %[src], %[stride_3] \n\t" 508 509 "vld $vr8, %[half], 0x00 \n\t" 510 "vld $vr9, %[half], 0x08 \n\t" 511 "vld $vr10, %[half], 0x10 \n\t" 512 "vld $vr11, %[half], 0x18 \n\t" 513 "vld $vr12, %[half], 0x20 \n\t" 514 "vld $vr13, %[half], 0x28 \n\t" 515 "vld $vr14, %[half], 0x30 \n\t" 516 "vld $vr15, %[half], 0x38 \n\t" 517 518 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 519 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 520 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 521 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 522 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 523 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 524 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 525 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 526 527 "slli.d %[stride_2], %[dstStride], 1 \n\t" 528 "add.d %[stride_3], %[stride_2], %[dstStride] \n\t" 529 "slli.d %[stride_4], %[stride_2], 1 \n\t" 530 "vld $vr8, %[tmp], 0 \n\t" 531 "vldx $vr9, %[tmp], %[dstStride] \n\t" 532 "vldx $vr10, %[tmp], %[stride_2] \n\t" 533 "vldx $vr11, %[tmp], %[stride_3] \n\t" 534 "add.d %[tmp], %[tmp], %[stride_4] \n\t" 535 "vld $vr12, %[tmp], 0 \n\t" 536 "vldx $vr13, %[tmp], %[dstStride] \n\t" 537 "vldx $vr14, %[tmp], %[stride_2] \n\t" 538 "vldx $vr15, %[tmp], %[stride_3] \n\t" 539 540 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 541 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 542 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 543 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 544 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 545 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 546 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 547 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 548 549 "vstelm.d $vr0, %[dst], 0, 0 \n\t" 550 "add.d %[dst], %[dst], %[dstStride] \n\t" 551 "vstelm.d $vr1, %[dst], 0, 0 \n\t" 552 "add.d %[dst], %[dst], %[dstStride] \n\t" 553 "vstelm.d $vr2, %[dst], 0, 0 \n\t" 554 "add.d %[dst], %[dst], %[dstStride] \n\t" 555 "vstelm.d $vr3, %[dst], 0, 0 \n\t" 556 "add.d %[dst], %[dst], %[dstStride] \n\t" 557 "vstelm.d $vr4, %[dst], 0, 0 \n\t" 558 "add.d %[dst], %[dst], %[dstStride] \n\t" 559 "vstelm.d $vr5, %[dst], 0, 0 \n\t" 560 "add.d %[dst], %[dst], %[dstStride] \n\t" 561 "vstelm.d $vr6, %[dst], 0, 0 \n\t" 562 "add.d %[dst], %[dst], %[dstStride] \n\t" 563 "vstelm.d $vr7, %[dst], 0, 0 \n\t" 564 : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), 565 [src]"+&r"(src), [stride_2]"=&r"(stride_2), 566 [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4) 567 : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride) 568 : "memory" 569 ); 570} 571 572/* put_pixels16_8_lsx: dst = src */ 573static av_always_inline void 574put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) 575{ 576 ptrdiff_t stride_2, stride_3, stride_4; 577 __asm__ volatile ( 578 "slli.d %[stride_2], %[stride], 1 \n\t" 579 "add.d %[stride_3], %[stride_2], %[stride] \n\t" 580 "slli.d %[stride_4], %[stride_2], 1 \n\t" 581 "vld $vr0, %[src], 0 \n\t" 582 "vldx $vr1, %[src], %[stride] \n\t" 583 "vldx $vr2, %[src], %[stride_2] \n\t" 584 "vldx $vr3, %[src], %[stride_3] \n\t" 585 "add.d %[src], %[src], %[stride_4] \n\t" 586 "vld $vr4, %[src], 0 \n\t" 587 "vldx $vr5, %[src], %[stride] \n\t" 588 "vldx $vr6, %[src], %[stride_2] \n\t" 589 "vldx $vr7, %[src], %[stride_3] \n\t" 590 "add.d %[src], %[src], %[stride_4] \n\t" 591 592 "vst $vr0, %[dst], 0 \n\t" 593 "vstx $vr1, %[dst], %[stride] \n\t" 594 "vstx $vr2, %[dst], %[stride_2] \n\t" 595 "vstx $vr3, %[dst], %[stride_3] \n\t" 596 "add.d %[dst], %[dst], %[stride_4] \n\t" 597 "vst $vr4, %[dst], 0 \n\t" 598 "vstx $vr5, %[dst], %[stride] \n\t" 599 "vstx $vr6, %[dst], %[stride_2] \n\t" 600 "vstx $vr7, %[dst], %[stride_3] \n\t" 601 "add.d %[dst], %[dst], %[stride_4] \n\t" 602 603 "vld $vr0, %[src], 0 \n\t" 604 "vldx $vr1, %[src], %[stride] \n\t" 605 "vldx $vr2, %[src], %[stride_2] \n\t" 606 "vldx $vr3, %[src], %[stride_3] \n\t" 607 "add.d %[src], %[src], %[stride_4] \n\t" 608 "vld $vr4, %[src], 0 \n\t" 609 "vldx $vr5, %[src], %[stride] \n\t" 610 "vldx $vr6, %[src], %[stride_2] \n\t" 611 "vldx $vr7, %[src], %[stride_3] \n\t" 612 613 "vst $vr0, %[dst], 0 \n\t" 614 "vstx $vr1, %[dst], %[stride] \n\t" 615 "vstx $vr2, %[dst], %[stride_2] \n\t" 616 "vstx $vr3, %[dst], %[stride_3] \n\t" 617 "add.d %[dst], %[dst], %[stride_4] \n\t" 618 "vst $vr4, %[dst], 0 \n\t" 619 "vstx $vr5, %[dst], %[stride] \n\t" 620 "vstx $vr6, %[dst], %[stride_2] \n\t" 621 "vstx $vr7, %[dst], %[stride_3] \n\t" 622 : [dst]"+&r"(dst), [src]"+&r"(src), 623 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 624 [stride_4]"=&r"(stride_4) 625 : [stride]"r"(stride) 626 : "memory" 627 ); 628} 629 630/* avg_pixels16_8_lsx : dst = avg(src, dst) 631 * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8. 632 * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 633static av_always_inline void 634avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) 635{ 636 uint8_t *tmp = dst; 637 ptrdiff_t stride_2, stride_3, stride_4; 638 __asm__ volatile ( 639 /* h0~h7 */ 640 "slli.d %[stride_2], %[stride], 1 \n\t" 641 "add.d %[stride_3], %[stride_2], %[stride] \n\t" 642 "slli.d %[stride_4], %[stride_2], 1 \n\t" 643 "vld $vr0, %[src], 0 \n\t" 644 "vldx $vr1, %[src], %[stride] \n\t" 645 "vldx $vr2, %[src], %[stride_2] \n\t" 646 "vldx $vr3, %[src], %[stride_3] \n\t" 647 "add.d %[src], %[src], %[stride_4] \n\t" 648 "vld $vr4, %[src], 0 \n\t" 649 "vldx $vr5, %[src], %[stride] \n\t" 650 "vldx $vr6, %[src], %[stride_2] \n\t" 651 "vldx $vr7, %[src], %[stride_3] \n\t" 652 "add.d %[src], %[src], %[stride_4] \n\t" 653 654 "vld $vr8, %[tmp], 0 \n\t" 655 "vldx $vr9, %[tmp], %[stride] \n\t" 656 "vldx $vr10, %[tmp], %[stride_2] \n\t" 657 "vldx $vr11, %[tmp], %[stride_3] \n\t" 658 "add.d %[tmp], %[tmp], %[stride_4] \n\t" 659 "vld $vr12, %[tmp], 0 \n\t" 660 "vldx $vr13, %[tmp], %[stride] \n\t" 661 "vldx $vr14, %[tmp], %[stride_2] \n\t" 662 "vldx $vr15, %[tmp], %[stride_3] \n\t" 663 "add.d %[tmp], %[tmp], %[stride_4] \n\t" 664 665 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 666 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 667 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 668 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 669 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 670 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 671 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 672 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 673 674 "vst $vr0, %[dst], 0 \n\t" 675 "vstx $vr1, %[dst], %[stride] \n\t" 676 "vstx $vr2, %[dst], %[stride_2] \n\t" 677 "vstx $vr3, %[dst], %[stride_3] \n\t" 678 "add.d %[dst], %[dst], %[stride_4] \n\t" 679 "vst $vr4, %[dst], 0 \n\t" 680 "vstx $vr5, %[dst], %[stride] \n\t" 681 "vstx $vr6, %[dst], %[stride_2] \n\t" 682 "vstx $vr7, %[dst], %[stride_3] \n\t" 683 "add.d %[dst], %[dst], %[stride_4] \n\t" 684 685 /* h8~h15 */ 686 "vld $vr0, %[src], 0 \n\t" 687 "vldx $vr1, %[src], %[stride] \n\t" 688 "vldx $vr2, %[src], %[stride_2] \n\t" 689 "vldx $vr3, %[src], %[stride_3] \n\t" 690 "add.d %[src], %[src], %[stride_4] \n\t" 691 "vld $vr4, %[src], 0 \n\t" 692 "vldx $vr5, %[src], %[stride] \n\t" 693 "vldx $vr6, %[src], %[stride_2] \n\t" 694 "vldx $vr7, %[src], %[stride_3] \n\t" 695 696 "vld $vr8, %[tmp], 0 \n\t" 697 "vldx $vr9, %[tmp], %[stride] \n\t" 698 "vldx $vr10, %[tmp], %[stride_2] \n\t" 699 "vldx $vr11, %[tmp], %[stride_3] \n\t" 700 "add.d %[tmp], %[tmp], %[stride_4] \n\t" 701 "vld $vr12, %[tmp], 0 \n\t" 702 "vldx $vr13, %[tmp], %[stride] \n\t" 703 "vldx $vr14, %[tmp], %[stride_2] \n\t" 704 "vldx $vr15, %[tmp], %[stride_3] \n\t" 705 706 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 707 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 708 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 709 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 710 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 711 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 712 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 713 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 714 715 "vst $vr0, %[dst], 0 \n\t" 716 "vstx $vr1, %[dst], %[stride] \n\t" 717 "vstx $vr2, %[dst], %[stride_2] \n\t" 718 "vstx $vr3, %[dst], %[stride_3] \n\t" 719 "add.d %[dst], %[dst], %[stride_4] \n\t" 720 "vst $vr4, %[dst], 0 \n\t" 721 "vstx $vr5, %[dst], %[stride] \n\t" 722 "vstx $vr6, %[dst], %[stride_2] \n\t" 723 "vstx $vr7, %[dst], %[stride_3] \n\t" 724 : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src), 725 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 726 [stride_4]"=&r"(stride_4) 727 : [stride]"r"(stride) 728 : "memory" 729 ); 730} 731 732/* avg_pixels16_8_lsx : dst = avg(src, dst) 733 * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8. 734 * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 735static av_always_inline void 736put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half, 737 ptrdiff_t dstStride, ptrdiff_t srcStride) 738{ 739 ptrdiff_t stride_2, stride_3, stride_4; 740 ptrdiff_t dstride_2, dstride_3, dstride_4; 741 __asm__ volatile ( 742 "slli.d %[stride_2], %[srcStride], 1 \n\t" 743 "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" 744 "slli.d %[stride_4], %[stride_2], 1 \n\t" 745 "slli.d %[dstride_2], %[dstStride], 1 \n\t" 746 "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t" 747 "slli.d %[dstride_4], %[dstride_2], 1 \n\t" 748 /* h0~h7 */ 749 "vld $vr0, %[src], 0 \n\t" 750 "vldx $vr1, %[src], %[srcStride] \n\t" 751 "vldx $vr2, %[src], %[stride_2] \n\t" 752 "vldx $vr3, %[src], %[stride_3] \n\t" 753 "add.d %[src], %[src], %[stride_4] \n\t" 754 "vld $vr4, %[src], 0 \n\t" 755 "vldx $vr5, %[src], %[srcStride] \n\t" 756 "vldx $vr6, %[src], %[stride_2] \n\t" 757 "vldx $vr7, %[src], %[stride_3] \n\t" 758 "add.d %[src], %[src], %[stride_4] \n\t" 759 760 "vld $vr8, %[half], 0x00 \n\t" 761 "vld $vr9, %[half], 0x10 \n\t" 762 "vld $vr10, %[half], 0x20 \n\t" 763 "vld $vr11, %[half], 0x30 \n\t" 764 "vld $vr12, %[half], 0x40 \n\t" 765 "vld $vr13, %[half], 0x50 \n\t" 766 "vld $vr14, %[half], 0x60 \n\t" 767 "vld $vr15, %[half], 0x70 \n\t" 768 769 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 770 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 771 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 772 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 773 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 774 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 775 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 776 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 777 778 "vst $vr0, %[dst], 0 \n\t" 779 "vstx $vr1, %[dst], %[dstStride] \n\t" 780 "vstx $vr2, %[dst], %[dstride_2] \n\t" 781 "vstx $vr3, %[dst], %[dstride_3] \n\t" 782 "add.d %[dst], %[dst], %[dstride_4] \n\t" 783 "vst $vr4, %[dst], 0 \n\t" 784 "vstx $vr5, %[dst], %[dstStride] \n\t" 785 "vstx $vr6, %[dst], %[dstride_2] \n\t" 786 "vstx $vr7, %[dst], %[dstride_3] \n\t" 787 "add.d %[dst], %[dst], %[dstride_4] \n\t" 788 789 /* h8~h15 */ 790 "vld $vr0, %[src], 0 \n\t" 791 "vldx $vr1, %[src], %[srcStride] \n\t" 792 "vldx $vr2, %[src], %[stride_2] \n\t" 793 "vldx $vr3, %[src], %[stride_3] \n\t" 794 "add.d %[src], %[src], %[stride_4] \n\t" 795 "vld $vr4, %[src], 0 \n\t" 796 "vldx $vr5, %[src], %[srcStride] \n\t" 797 "vldx $vr6, %[src], %[stride_2] \n\t" 798 "vldx $vr7, %[src], %[stride_3] \n\t" 799 800 "vld $vr8, %[half], 0x80 \n\t" 801 "vld $vr9, %[half], 0x90 \n\t" 802 "vld $vr10, %[half], 0xa0 \n\t" 803 "vld $vr11, %[half], 0xb0 \n\t" 804 "vld $vr12, %[half], 0xc0 \n\t" 805 "vld $vr13, %[half], 0xd0 \n\t" 806 "vld $vr14, %[half], 0xe0 \n\t" 807 "vld $vr15, %[half], 0xf0 \n\t" 808 809 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 810 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 811 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 812 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 813 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 814 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 815 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 816 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 817 818 "vst $vr0, %[dst], 0 \n\t" 819 "vstx $vr1, %[dst], %[dstStride] \n\t" 820 "vstx $vr2, %[dst], %[dstride_2] \n\t" 821 "vstx $vr3, %[dst], %[dstride_3] \n\t" 822 "add.d %[dst], %[dst], %[dstride_4] \n\t" 823 "vst $vr4, %[dst], 0 \n\t" 824 "vstx $vr5, %[dst], %[dstStride] \n\t" 825 "vstx $vr6, %[dst], %[dstride_2] \n\t" 826 "vstx $vr7, %[dst], %[dstride_3] \n\t" 827 : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src), 828 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 829 [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2), 830 [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4) 831 : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride) 832 : "memory" 833 ); 834} 835 836/* avg_pixels16_8_lsx : dst = avg(src, dst) 837 * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8. 838 * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 839static av_always_inline void 840avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half, 841 ptrdiff_t dstStride, ptrdiff_t srcStride) 842{ 843 uint8_t *tmp = dst; 844 ptrdiff_t stride_2, stride_3, stride_4; 845 ptrdiff_t dstride_2, dstride_3, dstride_4; 846 __asm__ volatile ( 847 "slli.d %[stride_2], %[srcStride], 1 \n\t" 848 "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" 849 "slli.d %[stride_4], %[stride_2], 1 \n\t" 850 "slli.d %[dstride_2], %[dstStride], 1 \n\t" 851 "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t" 852 "slli.d %[dstride_4], %[dstride_2], 1 \n\t" 853 /* h0~h7 */ 854 "vld $vr0, %[src], 0 \n\t" 855 "vldx $vr1, %[src], %[srcStride] \n\t" 856 "vldx $vr2, %[src], %[stride_2] \n\t" 857 "vldx $vr3, %[src], %[stride_3] \n\t" 858 "add.d %[src], %[src], %[stride_4] \n\t" 859 "vld $vr4, %[src], 0 \n\t" 860 "vldx $vr5, %[src], %[srcStride] \n\t" 861 "vldx $vr6, %[src], %[stride_2] \n\t" 862 "vldx $vr7, %[src], %[stride_3] \n\t" 863 "add.d %[src], %[src], %[stride_4] \n\t" 864 865 "vld $vr8, %[half], 0x00 \n\t" 866 "vld $vr9, %[half], 0x10 \n\t" 867 "vld $vr10, %[half], 0x20 \n\t" 868 "vld $vr11, %[half], 0x30 \n\t" 869 "vld $vr12, %[half], 0x40 \n\t" 870 "vld $vr13, %[half], 0x50 \n\t" 871 "vld $vr14, %[half], 0x60 \n\t" 872 "vld $vr15, %[half], 0x70 \n\t" 873 874 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 875 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 876 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 877 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 878 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 879 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 880 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 881 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 882 883 "vld $vr8, %[tmp], 0 \n\t" 884 "vldx $vr9, %[tmp], %[dstStride] \n\t" 885 "vldx $vr10, %[tmp], %[dstride_2] \n\t" 886 "vldx $vr11, %[tmp], %[dstride_3] \n\t" 887 "add.d %[tmp], %[tmp], %[dstride_4] \n\t" 888 "vld $vr12, %[tmp], 0 \n\t" 889 "vldx $vr13, %[tmp], %[dstStride] \n\t" 890 "vldx $vr14, %[tmp], %[dstride_2] \n\t" 891 "vldx $vr15, %[tmp], %[dstride_3] \n\t" 892 "add.d %[tmp], %[tmp], %[dstride_4] \n\t" 893 894 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 895 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 896 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 897 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 898 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 899 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 900 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 901 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 902 903 "vst $vr0, %[dst], 0 \n\t" 904 "vstx $vr1, %[dst], %[dstStride] \n\t" 905 "vstx $vr2, %[dst], %[dstride_2] \n\t" 906 "vstx $vr3, %[dst], %[dstride_3] \n\t" 907 "add.d %[dst], %[dst], %[dstride_4] \n\t" 908 "vst $vr4, %[dst], 0 \n\t" 909 "vstx $vr5, %[dst], %[dstStride] \n\t" 910 "vstx $vr6, %[dst], %[dstride_2] \n\t" 911 "vstx $vr7, %[dst], %[dstride_3] \n\t" 912 "add.d %[dst], %[dst], %[dstride_4] \n\t" 913 914 /* h8~h15 */ 915 "vld $vr0, %[src], 0 \n\t" 916 "vldx $vr1, %[src], %[srcStride] \n\t" 917 "vldx $vr2, %[src], %[stride_2] \n\t" 918 "vldx $vr3, %[src], %[stride_3] \n\t" 919 "add.d %[src], %[src], %[stride_4] \n\t" 920 "vld $vr4, %[src], 0 \n\t" 921 "vldx $vr5, %[src], %[srcStride] \n\t" 922 "vldx $vr6, %[src], %[stride_2] \n\t" 923 "vldx $vr7, %[src], %[stride_3] \n\t" 924 925 "vld $vr8, %[half], 0x80 \n\t" 926 "vld $vr9, %[half], 0x90 \n\t" 927 "vld $vr10, %[half], 0xa0 \n\t" 928 "vld $vr11, %[half], 0xb0 \n\t" 929 "vld $vr12, %[half], 0xc0 \n\t" 930 "vld $vr13, %[half], 0xd0 \n\t" 931 "vld $vr14, %[half], 0xe0 \n\t" 932 "vld $vr15, %[half], 0xf0 \n\t" 933 934 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 935 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 936 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 937 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 938 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 939 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 940 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 941 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 942 943 "vld $vr8, %[tmp], 0 \n\t" 944 "vldx $vr9, %[tmp], %[dstStride] \n\t" 945 "vldx $vr10, %[tmp], %[dstride_2] \n\t" 946 "vldx $vr11, %[tmp], %[dstride_3] \n\t" 947 "add.d %[tmp], %[tmp], %[dstride_4] \n\t" 948 "vld $vr12, %[tmp], 0 \n\t" 949 "vldx $vr13, %[tmp], %[dstStride] \n\t" 950 "vldx $vr14, %[tmp], %[dstride_2] \n\t" 951 "vldx $vr15, %[tmp], %[dstride_3] \n\t" 952 953 "vavgr.bu $vr0, $vr8, $vr0 \n\t" 954 "vavgr.bu $vr1, $vr9, $vr1 \n\t" 955 "vavgr.bu $vr2, $vr10, $vr2 \n\t" 956 "vavgr.bu $vr3, $vr11, $vr3 \n\t" 957 "vavgr.bu $vr4, $vr12, $vr4 \n\t" 958 "vavgr.bu $vr5, $vr13, $vr5 \n\t" 959 "vavgr.bu $vr6, $vr14, $vr6 \n\t" 960 "vavgr.bu $vr7, $vr15, $vr7 \n\t" 961 962 "vst $vr0, %[dst], 0 \n\t" 963 "vstx $vr1, %[dst], %[dstStride] \n\t" 964 "vstx $vr2, %[dst], %[dstride_2] \n\t" 965 "vstx $vr3, %[dst], %[dstride_3] \n\t" 966 "add.d %[dst], %[dst], %[dstride_4] \n\t" 967 "vst $vr4, %[dst], 0 \n\t" 968 "vstx $vr5, %[dst], %[dstStride] \n\t" 969 "vstx $vr6, %[dst], %[dstride_2] \n\t" 970 "vstx $vr7, %[dst], %[dstride_3] \n\t" 971 : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src), 972 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 973 [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2), 974 [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4) 975 : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride) 976 : "memory" 977 ); 978} 979 980#define QPEL8_H_LOWPASS(out_v) \ 981 src00 = __lasx_xvld(src, - 2); \ 982 src += srcStride; \ 983 src10 = __lasx_xvld(src, - 2); \ 984 src += srcStride; \ 985 src00 = __lasx_xvpermi_q(src00, src10, 0x02); \ 986 src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \ 987 src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \ 988 src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \ 989 src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \ 990 src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \ 991 DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\ 992 src00 = __lasx_xvaddwl_h_bu(src00, src05); \ 993 src02 = __lasx_xvmul_h(src02, h_20); \ 994 src01 = __lasx_xvmul_h(src01, h_5); \ 995 src02 = __lasx_xvssub_h(src02, src01); \ 996 src02 = __lasx_xvsadd_h(src02, src00); \ 997 src02 = __lasx_xvsadd_h(src02, h_16); \ 998 out_v = __lasx_xvssrani_bu_h(src02, src02, 5); \ 999 1000static av_always_inline void 1001put_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride, 1002 int srcStride) 1003{ 1004 int dstStride_2x = dstStride << 1; 1005 __m256i src00, src01, src02, src03, src04, src05, src10; 1006 __m256i out0, out1, out2, out3; 1007 __m256i h_20 = __lasx_xvldi(0x414); 1008 __m256i h_5 = __lasx_xvldi(0x405); 1009 __m256i h_16 = __lasx_xvldi(0x410); 1010 __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0}; 1011 __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0}; 1012 __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0}; 1013 __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0}; 1014 __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0}; 1015 1016 QPEL8_H_LOWPASS(out0) 1017 QPEL8_H_LOWPASS(out1) 1018 QPEL8_H_LOWPASS(out2) 1019 QPEL8_H_LOWPASS(out3) 1020 __lasx_xvstelm_d(out0, dst, 0, 0); 1021 __lasx_xvstelm_d(out0, dst + dstStride, 0, 2); 1022 dst += dstStride_2x; 1023 __lasx_xvstelm_d(out1, dst, 0, 0); 1024 __lasx_xvstelm_d(out1, dst + dstStride, 0, 2); 1025 dst += dstStride_2x; 1026 __lasx_xvstelm_d(out2, dst, 0, 0); 1027 __lasx_xvstelm_d(out2, dst + dstStride, 0, 2); 1028 dst += dstStride_2x; 1029 __lasx_xvstelm_d(out3, dst, 0, 0); 1030 __lasx_xvstelm_d(out3, dst + dstStride, 0, 2); 1031} 1032 1033#define QPEL8_V_LOWPASS(src0, src1, src2, src3, src4, src5, src6, \ 1034 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5) \ 1035{ \ 1036 tmp0 = __lasx_xvpermi_q(src0, src1, 0x02); \ 1037 tmp1 = __lasx_xvpermi_q(src1, src2, 0x02); \ 1038 tmp2 = __lasx_xvpermi_q(src2, src3, 0x02); \ 1039 tmp3 = __lasx_xvpermi_q(src3, src4, 0x02); \ 1040 tmp4 = __lasx_xvpermi_q(src4, src5, 0x02); \ 1041 tmp5 = __lasx_xvpermi_q(src5, src6, 0x02); \ 1042 DUP2_ARG2(__lasx_xvaddwl_h_bu, tmp2, tmp3, tmp1, tmp4, tmp2, tmp1); \ 1043 tmp0 = __lasx_xvaddwl_h_bu(tmp0, tmp5); \ 1044 tmp2 = __lasx_xvmul_h(tmp2, h_20); \ 1045 tmp1 = __lasx_xvmul_h(tmp1, h_5); \ 1046 tmp2 = __lasx_xvssub_h(tmp2, tmp1); \ 1047 tmp2 = __lasx_xvsadd_h(tmp2, tmp0); \ 1048 tmp2 = __lasx_xvsadd_h(tmp2, h_16); \ 1049 tmp2 = __lasx_xvssrani_bu_h(tmp2, tmp2, 5); \ 1050} 1051 1052static av_always_inline void 1053put_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride, 1054 int srcStride) 1055{ 1056 int srcStride_2x = srcStride << 1; 1057 int dstStride_2x = dstStride << 1; 1058 int srcStride_4x = srcStride << 2; 1059 int srcStride_3x = srcStride_2x + srcStride; 1060 __m256i src00, src01, src02, src03, src04, src05, src06; 1061 __m256i src07, src08, src09, src10, src11, src12; 1062 __m256i tmp00, tmp01, tmp02, tmp03, tmp04, tmp05; 1063 __m256i h_20 = __lasx_xvldi(0x414); 1064 __m256i h_5 = __lasx_xvldi(0x405); 1065 __m256i h_16 = __lasx_xvldi(0x410); 1066 1067 DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0, 1068 src00, src01); 1069 src02 = __lasx_xvld(src, 0); 1070 DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src, 1071 srcStride_3x, src, srcStride_4x, src03, src04, src05, src06); 1072 src += srcStride_4x; 1073 DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src, 1074 srcStride_3x, src, srcStride_4x, src07, src08, src09, src10); 1075 src += srcStride_4x; 1076 DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12); 1077 1078 QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06, 1079 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1080 __lasx_xvstelm_d(tmp02, dst, 0, 0); 1081 __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2); 1082 dst += dstStride_2x; 1083 QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08, 1084 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1085 __lasx_xvstelm_d(tmp02, dst, 0, 0); 1086 __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2); 1087 dst += dstStride_2x; 1088 QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10, 1089 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1090 __lasx_xvstelm_d(tmp02, dst, 0, 0); 1091 __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2); 1092 dst += dstStride_2x; 1093 QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12, 1094 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1095 __lasx_xvstelm_d(tmp02, dst, 0, 0); 1096 __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2); 1097} 1098 1099static av_always_inline void 1100avg_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride, 1101 int srcStride) 1102{ 1103 int srcStride_2x = srcStride << 1; 1104 int srcStride_4x = srcStride << 2; 1105 int dstStride_2x = dstStride << 1; 1106 int dstStride_4x = dstStride << 2; 1107 int srcStride_3x = srcStride_2x + srcStride; 1108 int dstStride_3x = dstStride_2x + dstStride; 1109 __m256i src00, src01, src02, src03, src04, src05, src06; 1110 __m256i src07, src08, src09, src10, src11, src12, tmp00; 1111 __m256i tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09; 1112 __m256i h_20 = __lasx_xvldi(0x414); 1113 __m256i h_5 = __lasx_xvldi(0x405); 1114 __m256i h_16 = __lasx_xvldi(0x410); 1115 1116 1117 DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0, 1118 src00, src01); 1119 src02 = __lasx_xvld(src, 0); 1120 DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src, 1121 srcStride_3x, src, srcStride_4x, src03, src04, src05, src06); 1122 src += srcStride_4x; 1123 DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src, 1124 srcStride_3x, src, srcStride_4x, src07, src08, src09, src10); 1125 src += srcStride_4x; 1126 DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12); 1127 1128 tmp06 = __lasx_xvld(dst, 0); 1129 DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, 1130 dst, dstStride_3x, dst, dstStride_4x, 1131 tmp07, tmp02, tmp03, tmp04); 1132 dst += dstStride_4x; 1133 DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, 1134 tmp05, tmp00); 1135 tmp01 = __lasx_xvldx(dst, dstStride_3x); 1136 dst -= dstStride_4x; 1137 1138 tmp06 = __lasx_xvpermi_q(tmp06, tmp07, 0x02); 1139 tmp07 = __lasx_xvpermi_q(tmp02, tmp03, 0x02); 1140 tmp08 = __lasx_xvpermi_q(tmp04, tmp05, 0x02); 1141 tmp09 = __lasx_xvpermi_q(tmp00, tmp01, 0x02); 1142 1143 QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06, 1144 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1145 tmp06 = __lasx_xvavgr_bu(tmp06, tmp02); 1146 __lasx_xvstelm_d(tmp06, dst, 0, 0); 1147 __lasx_xvstelm_d(tmp06, dst + dstStride, 0, 2); 1148 dst += dstStride_2x; 1149 QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08, 1150 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1151 tmp07 = __lasx_xvavgr_bu(tmp07, tmp02); 1152 __lasx_xvstelm_d(tmp07, dst, 0, 0); 1153 __lasx_xvstelm_d(tmp07, dst + dstStride, 0, 2); 1154 dst += dstStride_2x; 1155 QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10, 1156 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1157 tmp08 = __lasx_xvavgr_bu(tmp08, tmp02); 1158 __lasx_xvstelm_d(tmp08, dst, 0, 0); 1159 __lasx_xvstelm_d(tmp08, dst + dstStride, 0, 2); 1160 dst += dstStride_2x; 1161 QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12, 1162 tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1163 tmp09 = __lasx_xvavgr_bu(tmp09, tmp02); 1164 __lasx_xvstelm_d(tmp09, dst, 0, 0); 1165 __lasx_xvstelm_d(tmp09, dst + dstStride, 0, 2); 1166} 1167 1168#define QPEL8_HV_LOWPASS_H(tmp) \ 1169{ \ 1170 src00 = __lasx_xvld(src, -2); \ 1171 src += srcStride; \ 1172 src10 = __lasx_xvld(src, -2); \ 1173 src += srcStride; \ 1174 src00 = __lasx_xvpermi_q(src00, src10, 0x02); \ 1175 src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \ 1176 src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \ 1177 src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \ 1178 src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \ 1179 src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \ 1180 DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\ 1181 src00 = __lasx_xvaddwl_h_bu(src00, src05); \ 1182 src02 = __lasx_xvmul_h(src02, h_20); \ 1183 src01 = __lasx_xvmul_h(src01, h_5); \ 1184 src02 = __lasx_xvssub_h(src02, src01); \ 1185 tmp = __lasx_xvsadd_h(src02, src00); \ 1186} 1187 1188#define QPEL8_HV_LOWPASS_V(src0, src1, src2, src3, \ 1189 src4, src5, temp0, temp1, \ 1190 temp2, temp3, temp4, temp5, \ 1191 out) \ 1192{ \ 1193 DUP2_ARG2(__lasx_xvaddwl_w_h, src2, src3, src1, src4, temp0, temp2); \ 1194 DUP2_ARG2(__lasx_xvaddwh_w_h, src2, src3, src1, src4, temp1, temp3); \ 1195 temp4 = __lasx_xvaddwl_w_h(src0, src5); \ 1196 temp5 = __lasx_xvaddwh_w_h(src0, src5); \ 1197 temp0 = __lasx_xvmul_w(temp0, w_20); \ 1198 temp1 = __lasx_xvmul_w(temp1, w_20); \ 1199 temp2 = __lasx_xvmul_w(temp2, w_5); \ 1200 temp3 = __lasx_xvmul_w(temp3, w_5); \ 1201 temp0 = __lasx_xvssub_w(temp0, temp2); \ 1202 temp1 = __lasx_xvssub_w(temp1, temp3); \ 1203 temp0 = __lasx_xvsadd_w(temp0, temp4); \ 1204 temp1 = __lasx_xvsadd_w(temp1, temp5); \ 1205 temp0 = __lasx_xvsadd_w(temp0, w_512); \ 1206 temp1 = __lasx_xvsadd_w(temp1, w_512); \ 1207 temp0 = __lasx_xvssrani_hu_w(temp0, temp0, 10); \ 1208 temp1 = __lasx_xvssrani_hu_w(temp1, temp1, 10); \ 1209 temp0 = __lasx_xvpackev_d(temp1, temp0); \ 1210 out = __lasx_xvssrani_bu_h(temp0, temp0, 0); \ 1211} 1212 1213static av_always_inline void 1214put_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1215 ptrdiff_t dstStride, ptrdiff_t srcStride) 1216{ 1217 __m256i src00, src01, src02, src03, src04, src05, src10; 1218 __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; 1219 __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12; 1220 __m256i h_20 = __lasx_xvldi(0x414); 1221 __m256i h_5 = __lasx_xvldi(0x405); 1222 __m256i w_20 = __lasx_xvldi(0x814); 1223 __m256i w_5 = __lasx_xvldi(0x805); 1224 __m256i w_512 = {512}; 1225 __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0}; 1226 __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0}; 1227 __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0}; 1228 __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0}; 1229 __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0}; 1230 1231 w_512 = __lasx_xvreplve0_w(w_512); 1232 1233 src -= srcStride << 1; 1234 QPEL8_HV_LOWPASS_H(tmp0) 1235 QPEL8_HV_LOWPASS_H(tmp2) 1236 QPEL8_HV_LOWPASS_H(tmp4) 1237 QPEL8_HV_LOWPASS_H(tmp6) 1238 QPEL8_HV_LOWPASS_H(tmp8) 1239 QPEL8_HV_LOWPASS_H(tmp10) 1240 QPEL8_HV_LOWPASS_H(tmp12) 1241 tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21); 1242 tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21); 1243 tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21); 1244 tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21); 1245 tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21); 1246 tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21); 1247 1248 QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01, 1249 src02, src03, src04, src05, tmp0) 1250 QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01, 1251 src02, src03, src04, src05, tmp2) 1252 QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01, 1253 src02, src03, src04, src05, tmp4) 1254 QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01, 1255 src02, src03, src04, src05, tmp6) 1256 __lasx_xvstelm_d(tmp0, dst, 0, 0); 1257 dst += dstStride; 1258 __lasx_xvstelm_d(tmp0, dst, 0, 2); 1259 dst += dstStride; 1260 __lasx_xvstelm_d(tmp2, dst, 0, 0); 1261 dst += dstStride; 1262 __lasx_xvstelm_d(tmp2, dst, 0, 2); 1263 dst += dstStride; 1264 __lasx_xvstelm_d(tmp4, dst, 0, 0); 1265 dst += dstStride; 1266 __lasx_xvstelm_d(tmp4, dst, 0, 2); 1267 dst += dstStride; 1268 __lasx_xvstelm_d(tmp6, dst, 0, 0); 1269 dst += dstStride; 1270 __lasx_xvstelm_d(tmp6, dst, 0, 2); 1271} 1272 1273static av_always_inline void 1274avg_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride, 1275 int srcStride) 1276{ 1277 int dstStride_2x = dstStride << 1; 1278 int dstStride_4x = dstStride << 2; 1279 int dstStride_3x = dstStride_2x + dstStride; 1280 __m256i src00, src01, src02, src03, src04, src05, src10; 1281 __m256i dst00, dst01, dst0, dst1, dst2, dst3; 1282 __m256i out0, out1, out2, out3; 1283 __m256i h_20 = __lasx_xvldi(0x414); 1284 __m256i h_5 = __lasx_xvldi(0x405); 1285 __m256i h_16 = __lasx_xvldi(0x410); 1286 __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0}; 1287 __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0}; 1288 __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0}; 1289 __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0}; 1290 __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0}; 1291 1292 QPEL8_H_LOWPASS(out0) 1293 QPEL8_H_LOWPASS(out1) 1294 QPEL8_H_LOWPASS(out2) 1295 QPEL8_H_LOWPASS(out3) 1296 src00 = __lasx_xvld(dst, 0); 1297 DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst, 1298 dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04); 1299 dst += dstStride_4x; 1300 DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, dst00); 1301 dst01 = __lasx_xvldx(dst, dstStride_3x); 1302 dst -= dstStride_4x; 1303 dst0 = __lasx_xvpermi_q(src00, src01, 0x02); 1304 dst1 = __lasx_xvpermi_q(src02, src03, 0x02); 1305 dst2 = __lasx_xvpermi_q(src04, src05, 0x02); 1306 dst3 = __lasx_xvpermi_q(dst00, dst01, 0x02); 1307 dst0 = __lasx_xvavgr_bu(dst0, out0); 1308 dst1 = __lasx_xvavgr_bu(dst1, out1); 1309 dst2 = __lasx_xvavgr_bu(dst2, out2); 1310 dst3 = __lasx_xvavgr_bu(dst3, out3); 1311 __lasx_xvstelm_d(dst0, dst, 0, 0); 1312 __lasx_xvstelm_d(dst0, dst + dstStride, 0, 2); 1313 __lasx_xvstelm_d(dst1, dst + dstStride_2x, 0, 0); 1314 __lasx_xvstelm_d(dst1, dst + dstStride_3x, 0, 2); 1315 dst += dstStride_4x; 1316 __lasx_xvstelm_d(dst2, dst, 0, 0); 1317 __lasx_xvstelm_d(dst2, dst + dstStride, 0, 2); 1318 __lasx_xvstelm_d(dst3, dst + dstStride_2x, 0, 0); 1319 __lasx_xvstelm_d(dst3, dst + dstStride_3x, 0, 2); 1320} 1321 1322static av_always_inline void 1323avg_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1324 ptrdiff_t dstStride, ptrdiff_t srcStride) 1325{ 1326 __m256i src00, src01, src02, src03, src04, src05, src10; 1327 __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; 1328 __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12; 1329 __m256i h_20 = __lasx_xvldi(0x414); 1330 __m256i h_5 = __lasx_xvldi(0x405); 1331 __m256i w_20 = __lasx_xvldi(0x814); 1332 __m256i w_5 = __lasx_xvldi(0x805); 1333 __m256i w_512 = {512}; 1334 __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0}; 1335 __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0}; 1336 __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0}; 1337 __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0}; 1338 __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0}; 1339 ptrdiff_t dstStride_2x = dstStride << 1; 1340 ptrdiff_t dstStride_4x = dstStride << 2; 1341 ptrdiff_t dstStride_3x = dstStride_2x + dstStride; 1342 1343 w_512 = __lasx_xvreplve0_w(w_512); 1344 1345 src -= srcStride << 1; 1346 QPEL8_HV_LOWPASS_H(tmp0) 1347 QPEL8_HV_LOWPASS_H(tmp2) 1348 QPEL8_HV_LOWPASS_H(tmp4) 1349 QPEL8_HV_LOWPASS_H(tmp6) 1350 QPEL8_HV_LOWPASS_H(tmp8) 1351 QPEL8_HV_LOWPASS_H(tmp10) 1352 QPEL8_HV_LOWPASS_H(tmp12) 1353 tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21); 1354 tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21); 1355 tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21); 1356 tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21); 1357 tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21); 1358 tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21); 1359 1360 QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01, 1361 src02, src03, src04, src05, tmp0) 1362 QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01, 1363 src02, src03, src04, src05, tmp2) 1364 QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01, 1365 src02, src03, src04, src05, tmp4) 1366 QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01, 1367 src02, src03, src04, src05, tmp6) 1368 1369 src00 = __lasx_xvld(dst, 0); 1370 DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst, 1371 dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04); 1372 dst += dstStride_4x; 1373 DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, tmp8); 1374 tmp9 = __lasx_xvldx(dst, dstStride_3x); 1375 dst -= dstStride_4x; 1376 tmp1 = __lasx_xvpermi_q(src00, src01, 0x02); 1377 tmp3 = __lasx_xvpermi_q(src02, src03, 0x02); 1378 tmp5 = __lasx_xvpermi_q(src04, src05, 0x02); 1379 tmp7 = __lasx_xvpermi_q(tmp8, tmp9, 0x02); 1380 tmp0 = __lasx_xvavgr_bu(tmp0, tmp1); 1381 tmp2 = __lasx_xvavgr_bu(tmp2, tmp3); 1382 tmp4 = __lasx_xvavgr_bu(tmp4, tmp5); 1383 tmp6 = __lasx_xvavgr_bu(tmp6, tmp7); 1384 __lasx_xvstelm_d(tmp0, dst, 0, 0); 1385 dst += dstStride; 1386 __lasx_xvstelm_d(tmp0, dst, 0, 2); 1387 dst += dstStride; 1388 __lasx_xvstelm_d(tmp2, dst, 0, 0); 1389 dst += dstStride; 1390 __lasx_xvstelm_d(tmp2, dst, 0, 2); 1391 dst += dstStride; 1392 __lasx_xvstelm_d(tmp4, dst, 0, 0); 1393 dst += dstStride; 1394 __lasx_xvstelm_d(tmp4, dst, 0, 2); 1395 dst += dstStride; 1396 __lasx_xvstelm_d(tmp6, dst, 0, 0); 1397 dst += dstStride; 1398 __lasx_xvstelm_d(tmp6, dst, 0, 2); 1399} 1400 1401static av_always_inline void 1402put_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1403 int dstStride, int srcStride) 1404{ 1405 put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride); 1406 put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride); 1407 src += srcStride << 3; 1408 dst += dstStride << 3; 1409 put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride); 1410 put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride); 1411} 1412 1413static av_always_inline void 1414avg_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1415 int dstStride, int srcStride) 1416{ 1417 avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride); 1418 avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride); 1419 src += srcStride << 3; 1420 dst += dstStride << 3; 1421 avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride); 1422 avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride); 1423} 1424 1425static void put_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1426 int dstStride, int srcStride) 1427{ 1428 put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride); 1429 put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride); 1430 src += 8*srcStride; 1431 dst += 8*dstStride; 1432 put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride); 1433 put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride); 1434} 1435 1436static void avg_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1437 int dstStride, int srcStride) 1438{ 1439 avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride); 1440 avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride); 1441 src += 8*srcStride; 1442 dst += 8*dstStride; 1443 avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride); 1444 avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride); 1445} 1446 1447static void put_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1448 ptrdiff_t dstStride, ptrdiff_t srcStride) 1449{ 1450 put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride); 1451 put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride); 1452 src += srcStride << 3; 1453 dst += dstStride << 3; 1454 put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride); 1455 put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride); 1456} 1457 1458static void avg_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1459 ptrdiff_t dstStride, ptrdiff_t srcStride) 1460{ 1461 avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride); 1462 avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride); 1463 src += srcStride << 3; 1464 dst += dstStride << 3; 1465 avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride); 1466 avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride); 1467} 1468 1469void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src, 1470 ptrdiff_t stride) 1471{ 1472 /* In mmi optimization, it used function ff_put_pixels8_8_mmi 1473 * which implemented in hpeldsp_mmi.c */ 1474 put_pixels8_8_inline_asm(dst, src, stride); 1475} 1476 1477void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src, 1478 ptrdiff_t stride) 1479{ 1480 uint8_t half[64]; 1481 1482 put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride); 1483 /* in qpel8, the stride of half and height of block is 8 */ 1484 put_pixels8_l2_8_lsx(dst, src, half, stride, stride); 1485} 1486 1487void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src, 1488 ptrdiff_t stride) 1489{ 1490 put_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride); 1491} 1492 1493void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src, 1494 ptrdiff_t stride) 1495{ 1496 uint8_t half[64]; 1497 1498 put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride); 1499 put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride); 1500} 1501 1502void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src, 1503 ptrdiff_t stride) 1504{ 1505 uint8_t half[64]; 1506 1507 put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride); 1508 put_pixels8_l2_8_lsx(dst, src, half, stride, stride); 1509} 1510 1511void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src, 1512 ptrdiff_t stride) 1513{ 1514 uint8_t halfH[64]; 1515 uint8_t halfV[64]; 1516 1517 put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1518 put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride); 1519 put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1520} 1521 1522void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src, 1523 ptrdiff_t stride) 1524{ 1525 uint8_t temp[128]; 1526 uint8_t *const halfH = temp; 1527 uint8_t *const halfHV = temp + 64; 1528 1529 put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1530 put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1531 put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1532} 1533 1534void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src, 1535 ptrdiff_t stride) 1536{ 1537 uint8_t halfH[64]; 1538 uint8_t halfV[64]; 1539 1540 put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1541 put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride); 1542 put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1543} 1544 1545void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src, 1546 ptrdiff_t stride) 1547{ 1548 put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride); 1549} 1550 1551void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src, 1552 ptrdiff_t stride) 1553{ 1554 uint8_t temp[128]; 1555 uint8_t *const halfHV = temp; 1556 uint8_t *const halfH = temp + 64; 1557 1558 put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1559 put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride); 1560 put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1561} 1562 1563void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src, 1564 ptrdiff_t stride) 1565{ 1566 put_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride); 1567} 1568 1569void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src, 1570 ptrdiff_t stride) 1571{ 1572 uint8_t temp[128]; 1573 uint8_t *const halfHV = temp; 1574 uint8_t *const halfH = temp + 64; 1575 1576 put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1577 put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride); 1578 put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1579} 1580 1581void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src, 1582 ptrdiff_t stride) 1583{ 1584 uint8_t half[64]; 1585 1586 put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride); 1587 put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride); 1588} 1589 1590void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src, 1591 ptrdiff_t stride) 1592{ 1593 uint8_t halfH[64]; 1594 uint8_t halfV[64]; 1595 1596 put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1597 put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride); 1598 put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1599} 1600 1601void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src, 1602 ptrdiff_t stride) 1603{ 1604 uint8_t temp[128]; 1605 uint8_t *const halfH = temp; 1606 uint8_t *const halfHV = temp + 64; 1607 1608 put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1609 put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1610 put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1611} 1612 1613void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src, 1614 ptrdiff_t stride) 1615{ 1616 uint8_t halfH[64]; 1617 uint8_t halfV[64]; 1618 1619 put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1620 put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride); 1621 put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1622} 1623 1624void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src, 1625 ptrdiff_t stride) 1626{ 1627 /* In mmi optimization, it used function ff_avg_pixels8_8_mmi 1628 * which implemented in hpeldsp_mmi.c */ 1629 avg_pixels8_8_lsx(dst, src, stride); 1630} 1631 1632void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src, 1633 ptrdiff_t stride) 1634{ 1635 uint8_t half[64]; 1636 1637 put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride); 1638 avg_pixels8_l2_8_lsx(dst, src, half, stride, stride); 1639} 1640 1641void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src, 1642 ptrdiff_t stride) 1643{ 1644 avg_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride); 1645} 1646 1647void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src, 1648 ptrdiff_t stride) 1649{ 1650 uint8_t half[64]; 1651 1652 put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride); 1653 avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride); 1654} 1655 1656void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src, 1657 ptrdiff_t stride) 1658{ 1659 uint8_t halfH[64]; 1660 uint8_t halfV[64]; 1661 1662 put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1663 put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride); 1664 avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1665} 1666 1667void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src, 1668 ptrdiff_t stride) 1669{ 1670 uint8_t temp[128]; 1671 uint8_t *const halfH = temp; 1672 uint8_t *const halfHV = temp + 64; 1673 1674 put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1675 put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1676 avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1677} 1678 1679void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src, 1680 ptrdiff_t stride) 1681{ 1682 uint8_t halfH[64]; 1683 uint8_t halfV[64]; 1684 1685 put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1686 put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride); 1687 avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1688} 1689 1690void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src, 1691 ptrdiff_t stride) 1692{ 1693 avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride); 1694} 1695 1696void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src, 1697 ptrdiff_t stride) 1698{ 1699 uint8_t temp[128]; 1700 uint8_t *const halfHV = temp; 1701 uint8_t *const halfH = temp + 64; 1702 1703 put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1704 put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride); 1705 avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1706} 1707 1708void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src, 1709 ptrdiff_t stride) 1710{ 1711 avg_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride); 1712} 1713 1714void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src, 1715 ptrdiff_t stride) 1716{ 1717 uint8_t temp[128]; 1718 uint8_t *const halfHV = temp; 1719 uint8_t *const halfH = temp + 64; 1720 1721 put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1722 put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride); 1723 avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1724} 1725 1726void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src, 1727 ptrdiff_t stride) 1728{ 1729 uint8_t halfH[64]; 1730 uint8_t halfV[64]; 1731 1732 put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1733 put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride); 1734 avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1735} 1736 1737void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src, 1738 ptrdiff_t stride) 1739{ 1740 uint8_t temp[128]; 1741 uint8_t *const halfH = temp; 1742 uint8_t *const halfHV = temp + 64; 1743 1744 put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1745 put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1746 avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1747} 1748 1749void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src, 1750 ptrdiff_t stride) 1751{ 1752 uint8_t halfH[64]; 1753 uint8_t halfV[64]; 1754 1755 put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1756 put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride); 1757 avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1758} 1759 1760void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src, 1761 ptrdiff_t stride) 1762{ 1763 /* In mmi optimization, it used function ff_put_pixels16_8_mmi 1764 * which implemented in hpeldsp_mmi.c */ 1765 put_pixels16_8_lsx(dst, src, stride); 1766} 1767 1768void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src, 1769 ptrdiff_t stride) 1770{ 1771 uint8_t half[256]; 1772 1773 put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride); 1774 put_pixels16_l2_8_lsx(dst, src, half, stride, stride); 1775} 1776 1777void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src, 1778 ptrdiff_t stride) 1779{ 1780 put_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride); 1781} 1782 1783void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src, 1784 ptrdiff_t stride) 1785{ 1786 uint8_t half[256]; 1787 1788 put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride); 1789 put_pixels16_l2_8_lsx(dst, src+1, half, stride, stride); 1790} 1791 1792void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src, 1793 ptrdiff_t stride) 1794{ 1795 uint8_t half[256]; 1796 1797 put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride); 1798 put_pixels16_l2_8_lsx(dst, src, half, stride, stride); 1799} 1800 1801void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src, 1802 ptrdiff_t stride) 1803{ 1804 avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2), 1805 dst, stride); 1806} 1807 1808void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src, 1809 ptrdiff_t stride) 1810{ 1811 uint8_t temp[512]; 1812 uint8_t *const halfH = temp; 1813 uint8_t *const halfHV = temp + 256; 1814 1815 put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride); 1816 put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1817 put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1818} 1819 1820void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src, 1821 ptrdiff_t stride) 1822{ 1823 avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2) + 1, 1824 dst, stride); 1825} 1826 1827void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src, 1828 ptrdiff_t stride) 1829{ 1830 put_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride); 1831} 1832 1833void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src, 1834 ptrdiff_t stride) 1835{ 1836 uint8_t temp[512]; 1837 uint8_t *const halfHV = temp; 1838 uint8_t *const halfH = temp + 256; 1839 1840 put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1841 put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride); 1842 put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1843} 1844 1845void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src, 1846 ptrdiff_t stride) 1847{ 1848 put_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride); 1849} 1850 1851void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src, 1852 ptrdiff_t stride) 1853{ 1854 uint8_t temp[512]; 1855 uint8_t *const halfHV = temp; 1856 uint8_t *const halfH = temp + 256; 1857 1858 put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1859 put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride); 1860 put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1861} 1862 1863void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src, 1864 ptrdiff_t stride) 1865{ 1866 uint8_t half[256]; 1867 1868 put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride); 1869 put_pixels16_l2_8_lsx(dst, src+stride, half, stride, stride); 1870} 1871 1872void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src, 1873 ptrdiff_t stride) 1874{ 1875 avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2, (uint8_t*)src - (stride * 2), 1876 dst, stride); 1877} 1878 1879void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src, 1880 ptrdiff_t stride) 1881{ 1882 uint8_t temp[512]; 1883 uint8_t *const halfH = temp; 1884 uint8_t *const halfHV = temp + 256; 1885 1886 put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride); 1887 put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1888 put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1889} 1890 1891void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src, 1892 ptrdiff_t stride) 1893{ 1894 avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2, 1895 (uint8_t*)src - (stride * 2) + 1, dst, stride); 1896} 1897 1898void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src, 1899 ptrdiff_t stride) 1900{ 1901 /* In mmi optimization, it used function ff_avg_pixels16_8_mmi 1902 * which implemented in hpeldsp_mmi.c */ 1903 avg_pixels16_8_lsx(dst, src, stride); 1904} 1905 1906void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src, 1907 ptrdiff_t stride) 1908{ 1909 uint8_t half[256]; 1910 1911 put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride); 1912 avg_pixels16_l2_8_lsx(dst, src, half, stride, stride); 1913} 1914 1915void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src, 1916 ptrdiff_t stride) 1917{ 1918 avg_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride); 1919} 1920 1921void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src, 1922 ptrdiff_t stride) 1923{ 1924 uint8_t half[256]; 1925 1926 put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride); 1927 avg_pixels16_l2_8_lsx(dst, src+1, half, stride, stride); 1928} 1929 1930void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src, 1931 ptrdiff_t stride) 1932{ 1933 uint8_t half[256]; 1934 1935 put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride); 1936 avg_pixels16_l2_8_lsx(dst, src, half, stride, stride); 1937} 1938 1939void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src, 1940 ptrdiff_t stride) 1941{ 1942 avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2, 1943 (uint8_t*)src - (stride * 2), 1944 dst, stride); 1945} 1946 1947void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src, 1948 ptrdiff_t stride) 1949{ 1950 uint8_t temp[512]; 1951 uint8_t *const halfH = temp; 1952 uint8_t *const halfHV = temp + 256; 1953 1954 put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride); 1955 put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1956 avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1957} 1958 1959void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src, 1960 ptrdiff_t stride) 1961{ 1962 avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2, 1963 (uint8_t*)src - (stride * 2) + 1, 1964 dst, stride); 1965} 1966 1967void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src, 1968 ptrdiff_t stride) 1969{ 1970 avg_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride); 1971} 1972 1973void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src, 1974 ptrdiff_t stride) 1975{ 1976 uint8_t temp[512]; 1977 uint8_t *const halfHV = temp; 1978 uint8_t *const halfH = temp + 256; 1979 1980 put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1981 put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride); 1982 avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1983} 1984 1985void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src, 1986 ptrdiff_t stride) 1987{ 1988 avg_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride); 1989} 1990 1991void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src, 1992 ptrdiff_t stride) 1993{ 1994 uint8_t temp[512]; 1995 uint8_t *const halfHV = temp; 1996 uint8_t *const halfH = temp + 256; 1997 1998 put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1999 put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride); 2000 avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 2001} 2002 2003void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src, 2004 ptrdiff_t stride) 2005{ 2006 uint8_t half[256]; 2007 2008 put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride); 2009 avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride); 2010} 2011 2012void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src, 2013 ptrdiff_t stride) 2014{ 2015 avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2, 2016 (uint8_t*)src - (stride * 2), 2017 dst, stride); 2018} 2019 2020void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src, 2021 ptrdiff_t stride) 2022{ 2023 uint8_t temp[512]; 2024 uint8_t *const halfH = temp; 2025 uint8_t *const halfHV = temp + 256; 2026 2027 put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride); 2028 put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 2029 avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 2030} 2031 2032void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src, 2033 ptrdiff_t stride) 2034{ 2035 avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2, 2036 (uint8_t*)src - (stride * 2) + 1, 2037 dst, stride); 2038} 2039