1/* 2 * Loongson LASX optimized h264chroma 3 * 4 * Copyright (c) 2020 Loongson Technology Corporation Limited 5 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 6 * 7 * This file is part of FFmpeg. 8 * 9 * FFmpeg is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU Lesser General Public 11 * License as published by the Free Software Foundation; either 12 * version 2.1 of the License, or (at your option) any later version. 13 * 14 * FFmpeg is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * Lesser General Public License for more details. 18 * 19 * You should have received a copy of the GNU Lesser General Public 20 * License along with FFmpeg; if not, write to the Free Software 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22 */ 23 24#include "h264chroma_lasx.h" 25#include "libavutil/attributes.h" 26#include "libavutil/avassert.h" 27#include "libavutil/loongarch/loongson_intrinsics.h" 28 29static const uint8_t chroma_mask_arr[64] = { 30 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 31 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 32 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 33 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 34}; 35 36static av_always_inline void avc_chroma_hv_8x4_lasx(uint8_t *src, uint8_t *dst, 37 ptrdiff_t stride, uint32_t coef_hor0, 38 uint32_t coef_hor1, uint32_t coef_ver0, 39 uint32_t coef_ver1) 40{ 41 ptrdiff_t stride_2x = stride << 1; 42 ptrdiff_t stride_3x = stride_2x + stride; 43 ptrdiff_t stride_4x = stride_2x << 1; 44 __m256i src0, src1, src2, src3, src4, out; 45 __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1; 46 __m256i mask; 47 __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 48 __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 49 __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 50 __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 51 __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 52 53 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 54 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 55 src1, src2, src3, src4); 56 DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3); 57 src0 = __lasx_xvshuf_b(src0, src0, mask); 58 DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3); 59 DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1); 60 res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec); 61 res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0); 62 res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0); 63 res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20); 64 res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3); 65 res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1); 66 res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1); 67 out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6); 68 __lasx_xvstelm_d(out, dst, 0, 0); 69 __lasx_xvstelm_d(out, dst + stride, 0, 2); 70 __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 71 __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 72} 73 74static av_always_inline void avc_chroma_hv_8x8_lasx(uint8_t *src, uint8_t *dst, 75 ptrdiff_t stride, uint32_t coef_hor0, 76 uint32_t coef_hor1, uint32_t coef_ver0, 77 uint32_t coef_ver1) 78{ 79 ptrdiff_t stride_2x = stride << 1; 80 ptrdiff_t stride_3x = stride_2x + stride; 81 ptrdiff_t stride_4x = stride << 2; 82 __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 83 __m256i out0, out1; 84 __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4; 85 __m256i res_vt0, res_vt1, res_vt2, res_vt3; 86 __m256i mask; 87 __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 88 __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 89 __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 90 __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 91 __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 92 93 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 94 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 95 src1, src2, src3, src4); 96 src += stride_4x; 97 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 98 src5, src6, src7, src8); 99 DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20, 100 src8, src7, 0x20, src1, src3, src5, src7); 101 src0 = __lasx_xvshuf_b(src0, src0, mask); 102 DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7, 103 src7, mask, src1, src3, src5, src7); 104 DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3, 105 coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3); 106 res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec); 107 res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0); 108 res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0); 109 res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0); 110 res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0); 111 res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20); 112 res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3); 113 res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3); 114 res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3); 115 DUP4_ARG3(__lasx_xvmadd_h, res_vt0, res_hz0, coeff_vt_vec1, res_vt1, res_hz1, coeff_vt_vec1, 116 res_vt2, res_hz2, coeff_vt_vec1, res_vt3, res_hz3, coeff_vt_vec1, 117 res_vt0, res_vt1, res_vt2, res_vt3); 118 DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6, out0, out1); 119 __lasx_xvstelm_d(out0, dst, 0, 0); 120 __lasx_xvstelm_d(out0, dst + stride, 0, 2); 121 __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 122 __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 123 dst += stride_4x; 124 __lasx_xvstelm_d(out1, dst, 0, 0); 125 __lasx_xvstelm_d(out1, dst + stride, 0, 2); 126 __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 127 __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 128} 129 130static av_always_inline void avc_chroma_hz_8x4_lasx(uint8_t *src, uint8_t *dst, 131 ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1) 132{ 133 ptrdiff_t stride_2x = stride << 1; 134 ptrdiff_t stride_3x = stride_2x + stride; 135 __m256i src0, src1, src2, src3, out; 136 __m256i res0, res1; 137 __m256i mask; 138 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 139 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 140 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 141 142 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 143 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 144 DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src1, src2); 145 src3 = __lasx_xvldx(src, stride_3x); 146 DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2); 147 DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2); 148 DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); 149 out = __lasx_xvssrarni_bu_h(res1, res0, 6); 150 __lasx_xvstelm_d(out, dst, 0, 0); 151 __lasx_xvstelm_d(out, dst + stride, 0, 2); 152 __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 153 __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 154 155} 156 157static av_always_inline void avc_chroma_hz_8x8_lasx(uint8_t *src, uint8_t *dst, 158 ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1) 159{ 160 ptrdiff_t stride_2x = stride << 1; 161 ptrdiff_t stride_3x = stride_2x + stride; 162 ptrdiff_t stride_4x = stride << 2; 163 __m256i src0, src1, src2, src3, src4, src5, src6, src7; 164 __m256i out0, out1; 165 __m256i res0, res1, res2, res3; 166 __m256i mask; 167 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 168 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 169 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 170 171 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 172 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 173 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 174 src1, src2, src3, src4); 175 src += stride_4x; 176 DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src5, src6); 177 src7 = __lasx_xvldx(src, stride_3x); 178 DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20, 179 src7, src6, 0x20, src0, src2, src4, src6); 180 DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4, mask, 181 src6, src6, mask, src0, src2, src4, src6); 182 DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6, 183 coeff_vec, res0, res1, res2, res3); 184 DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1); 185 __lasx_xvstelm_d(out0, dst, 0, 0); 186 __lasx_xvstelm_d(out0, dst + stride, 0, 2); 187 __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 188 __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 189 dst += stride_4x; 190 __lasx_xvstelm_d(out1, dst, 0, 0); 191 __lasx_xvstelm_d(out1, dst + stride, 0, 2); 192 __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 193 __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 194} 195 196static av_always_inline void avc_chroma_hz_nonmult_lasx(uint8_t *src, 197 uint8_t *dst, ptrdiff_t stride, uint32_t coeff0, 198 uint32_t coeff1, int32_t height) 199{ 200 uint32_t row; 201 ptrdiff_t stride_2x = stride << 1; 202 ptrdiff_t stride_3x = stride_2x + stride; 203 ptrdiff_t stride_4x = stride << 2; 204 __m256i src0, src1, src2, src3, out; 205 __m256i res0, res1; 206 __m256i mask; 207 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 208 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 209 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 210 211 mask = __lasx_xvld(chroma_mask_arr, 0); 212 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 213 214 for (row = height >> 2; row--;) { 215 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 216 src0, src1, src2, src3); 217 src += stride_4x; 218 DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2); 219 DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2); 220 DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); 221 out = __lasx_xvssrarni_bu_h(res1, res0, 6); 222 __lasx_xvstelm_d(out, dst, 0, 0); 223 __lasx_xvstelm_d(out, dst + stride, 0, 2); 224 __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 225 __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 226 dst += stride_4x; 227 } 228 229 if ((height & 3)) { 230 src0 = __lasx_xvld(src, 0); 231 src1 = __lasx_xvldx(src, stride); 232 src1 = __lasx_xvpermi_q(src1, src0, 0x20); 233 src0 = __lasx_xvshuf_b(src1, src1, mask); 234 res0 = __lasx_xvdp2_h_bu(src0, coeff_vec); 235 out = __lasx_xvssrarni_bu_h(res0, res0, 6); 236 __lasx_xvstelm_d(out, dst, 0, 0); 237 dst += stride; 238 __lasx_xvstelm_d(out, dst, 0, 2); 239 } 240} 241 242static av_always_inline void avc_chroma_vt_8x4_lasx(uint8_t *src, uint8_t *dst, 243 ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1) 244{ 245 ptrdiff_t stride_2x = stride << 1; 246 ptrdiff_t stride_3x = stride_2x + stride; 247 __m256i src0, src1, src2, src3, src4, out; 248 __m256i res0, res1; 249 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 250 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 251 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 252 253 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 254 src0 = __lasx_xvld(src, 0); 255 src += stride; 256 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 257 src1, src2, src3, src4); 258 DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, 259 src4, src3, 0x20, src0, src1, src2, src3); 260 DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2); 261 DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); 262 out = __lasx_xvssrarni_bu_h(res1, res0, 6); 263 __lasx_xvstelm_d(out, dst, 0, 0); 264 __lasx_xvstelm_d(out, dst + stride, 0, 2); 265 __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 266 __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 267} 268 269static av_always_inline void avc_chroma_vt_8x8_lasx(uint8_t *src, uint8_t *dst, 270 ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1) 271{ 272 ptrdiff_t stride_2x = stride << 1; 273 ptrdiff_t stride_3x = stride_2x + stride; 274 ptrdiff_t stride_4x = stride << 2; 275 __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 276 __m256i out0, out1; 277 __m256i res0, res1, res2, res3; 278 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 279 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 280 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 281 282 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 283 src0 = __lasx_xvld(src, 0); 284 src += stride; 285 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 286 src1, src2, src3, src4); 287 src += stride_4x; 288 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 289 src5, src6, src7, src8); 290 DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, 291 src4, src3, 0x20, src0, src1, src2, src3); 292 DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20, 293 src8, src7, 0x20, src4, src5, src6, src7); 294 DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6, 295 src0, src2, src4, src6); 296 DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, 297 src6, coeff_vec, res0, res1, res2, res3); 298 DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1); 299 __lasx_xvstelm_d(out0, dst, 0, 0); 300 __lasx_xvstelm_d(out0, dst + stride, 0, 2); 301 __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 302 __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 303 dst += stride_4x; 304 __lasx_xvstelm_d(out1, dst, 0, 0); 305 __lasx_xvstelm_d(out1, dst + stride, 0, 2); 306 __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 307 __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 308} 309 310static av_always_inline void copy_width8x8_lasx(uint8_t *src, uint8_t *dst, 311 ptrdiff_t stride) 312{ 313 uint64_t tmp[8]; 314 ptrdiff_t stride_2, stride_3, stride_4; 315 __asm__ volatile ( 316 "slli.d %[stride_2], %[stride], 1 \n\t" 317 "add.d %[stride_3], %[stride_2], %[stride] \n\t" 318 "slli.d %[stride_4], %[stride_2], 1 \n\t" 319 "ld.d %[tmp0], %[src], 0x0 \n\t" 320 "ldx.d %[tmp1], %[src], %[stride] \n\t" 321 "ldx.d %[tmp2], %[src], %[stride_2] \n\t" 322 "ldx.d %[tmp3], %[src], %[stride_3] \n\t" 323 "add.d %[src], %[src], %[stride_4] \n\t" 324 "ld.d %[tmp4], %[src], 0x0 \n\t" 325 "ldx.d %[tmp5], %[src], %[stride] \n\t" 326 "ldx.d %[tmp6], %[src], %[stride_2] \n\t" 327 "ldx.d %[tmp7], %[src], %[stride_3] \n\t" 328 329 "st.d %[tmp0], %[dst], 0x0 \n\t" 330 "stx.d %[tmp1], %[dst], %[stride] \n\t" 331 "stx.d %[tmp2], %[dst], %[stride_2] \n\t" 332 "stx.d %[tmp3], %[dst], %[stride_3] \n\t" 333 "add.d %[dst], %[dst], %[stride_4] \n\t" 334 "st.d %[tmp4], %[dst], 0x0 \n\t" 335 "stx.d %[tmp5], %[dst], %[stride] \n\t" 336 "stx.d %[tmp6], %[dst], %[stride_2] \n\t" 337 "stx.d %[tmp7], %[dst], %[stride_3] \n\t" 338 : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), 339 [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]), 340 [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]), 341 [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]), 342 [dst]"+&r"(dst), [src]"+&r"(src), 343 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 344 [stride_4]"=&r"(stride_4) 345 : [stride]"r"(stride) 346 : "memory" 347 ); 348} 349 350static av_always_inline void copy_width8x4_lasx(uint8_t *src, uint8_t *dst, 351 ptrdiff_t stride) 352{ 353 uint64_t tmp[4]; 354 ptrdiff_t stride_2, stride_3; 355 __asm__ volatile ( 356 "slli.d %[stride_2], %[stride], 1 \n\t" 357 "add.d %[stride_3], %[stride_2], %[stride] \n\t" 358 "ld.d %[tmp0], %[src], 0x0 \n\t" 359 "ldx.d %[tmp1], %[src], %[stride] \n\t" 360 "ldx.d %[tmp2], %[src], %[stride_2] \n\t" 361 "ldx.d %[tmp3], %[src], %[stride_3] \n\t" 362 363 "st.d %[tmp0], %[dst], 0x0 \n\t" 364 "stx.d %[tmp1], %[dst], %[stride] \n\t" 365 "stx.d %[tmp2], %[dst], %[stride_2] \n\t" 366 "stx.d %[tmp3], %[dst], %[stride_3] \n\t" 367 : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), 368 [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]), 369 [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3) 370 : [stride]"r"(stride), [dst]"r"(dst), [src]"r"(src) 371 : "memory" 372 ); 373} 374 375static void avc_chroma_hv_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 376 uint32_t coef_hor0, uint32_t coef_hor1, 377 uint32_t coef_ver0, uint32_t coef_ver1, 378 int32_t height) 379{ 380 if (4 == height) { 381 avc_chroma_hv_8x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 382 coef_ver1); 383 } else if (8 == height) { 384 avc_chroma_hv_8x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 385 coef_ver1); 386 } 387} 388 389static void avc_chroma_hv_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 390 uint32_t coef_hor0, uint32_t coef_hor1, 391 uint32_t coef_ver0, uint32_t coef_ver1) 392{ 393 ptrdiff_t stride_2 = stride << 1; 394 __m256i src0, src1, src2; 395 __m256i res_hz, res_vt; 396 __m256i mask; 397 __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 398 __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 399 __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 400 __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 401 __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 402 __m256i coeff_vt_vec = __lasx_xvpermi_q(coeff_vt_vec1, coeff_vt_vec0, 0x02); 403 404 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 405 DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2); 406 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src0, src1); 407 src0 = __lasx_xvpermi_q(src0, src1, 0x02); 408 res_hz = __lasx_xvdp2_h_bu(src0, coeff_hz_vec); 409 res_vt = __lasx_xvmul_h(res_hz, coeff_vt_vec); 410 res_hz = __lasx_xvpermi_q(res_hz, res_vt, 0x01); 411 res_vt = __lasx_xvadd_h(res_hz, res_vt); 412 res_vt = __lasx_xvssrarni_bu_h(res_vt, res_vt, 6); 413 __lasx_xvstelm_w(res_vt, dst, 0, 0); 414 __lasx_xvstelm_w(res_vt, dst + stride, 0, 1); 415} 416 417static void avc_chroma_hv_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 418 uint32_t coef_hor0, uint32_t coef_hor1, 419 uint32_t coef_ver0, uint32_t coef_ver1) 420{ 421 ptrdiff_t stride_2 = stride << 1; 422 ptrdiff_t stride_3 = stride_2 + stride; 423 ptrdiff_t stride_4 = stride_2 << 1; 424 __m256i src0, src1, src2, src3, src4; 425 __m256i res_hz0, res_hz1, res_vt0, res_vt1; 426 __m256i mask; 427 __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 428 __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 429 __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 430 __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 431 __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 432 433 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 434 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 435 src, stride_4, src1, src2, src3, src4); 436 DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask, 437 src4, src3, mask, src0, src1, src2, src3); 438 DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src0, src1); 439 DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1); 440 DUP2_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); 441 res_hz0 = __lasx_xvadd_h(res_vt0, res_vt1); 442 res_hz0 = __lasx_xvssrarni_bu_h(res_hz0, res_hz0, 6); 443 __lasx_xvstelm_w(res_hz0, dst, 0, 0); 444 __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1); 445 __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4); 446 __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5); 447} 448 449static void avc_chroma_hv_4x8_lasx(uint8_t *src, uint8_t * dst, ptrdiff_t stride, 450 uint32_t coef_hor0, uint32_t coef_hor1, 451 uint32_t coef_ver0, uint32_t coef_ver1) 452{ 453 ptrdiff_t stride_2 = stride << 1; 454 ptrdiff_t stride_3 = stride_2 + stride; 455 ptrdiff_t stride_4 = stride_2 << 1; 456 __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 457 __m256i res_hz0, res_hz1, res_hz2, res_hz3; 458 __m256i res_vt0, res_vt1, res_vt2, res_vt3; 459 __m256i mask; 460 __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 461 __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 462 __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 463 __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 464 __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 465 466 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 467 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 468 src, stride_4, src1, src2, src3, src4); 469 src += stride_4; 470 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 471 src, stride_4, src5, src6, src7, src8); 472 DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask, 473 src4, src3, mask, src0, src1, src2, src3); 474 DUP4_ARG3(__lasx_xvshuf_b, src5, src4, mask, src6, src5, mask, src7, src6, mask, 475 src8, src7, mask, src4, src5, src6, src7); 476 DUP4_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src4, src6, 0x02, 477 src5, src7, 0x02, src0, src1, src4, src5); 478 DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src4, coeff_hz_vec, 479 src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3); 480 DUP4_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, 481 coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3); 482 DUP2_ARG2(__lasx_xvadd_h, res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2); 483 res_hz0 = __lasx_xvssrarni_bu_h(res_vt2, res_vt0, 6); 484 __lasx_xvstelm_w(res_hz0, dst, 0, 0); 485 __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1); 486 __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4); 487 __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5); 488 dst += stride_4; 489 __lasx_xvstelm_w(res_hz0, dst, 0, 2); 490 __lasx_xvstelm_w(res_hz0, dst + stride, 0, 3); 491 __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 6); 492 __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 7); 493} 494 495static void avc_chroma_hv_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 496 uint32_t coef_hor0, uint32_t coef_hor1, 497 uint32_t coef_ver0, uint32_t coef_ver1, 498 int32_t height) 499{ 500 if (8 == height) { 501 avc_chroma_hv_4x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 502 coef_ver1); 503 } else if (4 == height) { 504 avc_chroma_hv_4x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 505 coef_ver1); 506 } else if (2 == height) { 507 avc_chroma_hv_4x2_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 508 coef_ver1); 509 } 510} 511 512static void avc_chroma_hz_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 513 uint32_t coeff0, uint32_t coeff1) 514{ 515 __m256i src0, src1; 516 __m256i res, mask; 517 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 518 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 519 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 520 521 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 522 src1 = __lasx_xvldx(src, stride); 523 src0 = __lasx_xvshuf_b(src1, src0, mask); 524 res = __lasx_xvdp2_h_bu(src0, coeff_vec); 525 res = __lasx_xvslli_h(res, 3); 526 res = __lasx_xvssrarni_bu_h(res, res, 6); 527 __lasx_xvstelm_w(res, dst, 0, 0); 528 __lasx_xvstelm_w(res, dst + stride, 0, 1); 529} 530 531static void avc_chroma_hz_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 532 uint32_t coeff0, uint32_t coeff1) 533{ 534 ptrdiff_t stride_2 = stride << 1; 535 ptrdiff_t stride_3 = stride_2 + stride; 536 __m256i src0, src1, src2, src3; 537 __m256i res, mask; 538 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 539 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 540 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 541 542 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 543 DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2); 544 src3 = __lasx_xvldx(src, stride_3); 545 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src0, src2); 546 src0 = __lasx_xvpermi_q(src0, src2, 0x02); 547 res = __lasx_xvdp2_h_bu(src0, coeff_vec); 548 res = __lasx_xvslli_h(res, 3); 549 res = __lasx_xvssrarni_bu_h(res, res, 6); 550 __lasx_xvstelm_w(res, dst, 0, 0); 551 __lasx_xvstelm_w(res, dst + stride, 0, 1); 552 __lasx_xvstelm_w(res, dst + stride_2, 0, 4); 553 __lasx_xvstelm_w(res, dst + stride_3, 0, 5); 554} 555 556static void avc_chroma_hz_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 557 uint32_t coeff0, uint32_t coeff1) 558{ 559 ptrdiff_t stride_2 = stride << 1; 560 ptrdiff_t stride_3 = stride_2 + stride; 561 ptrdiff_t stride_4 = stride_2 << 1; 562 __m256i src0, src1, src2, src3, src4, src5, src6, src7; 563 __m256i res0, res1, mask; 564 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 565 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 566 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 567 568 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 569 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 570 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 571 src, stride_4, src1, src2, src3, src4); 572 src += stride_4; 573 DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src5, src6); 574 src7 = __lasx_xvldx(src, stride_3); 575 DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask, 576 src7, src6, mask, src0, src2, src4, src6); 577 DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src4, src6, 0x02, src0, src4); 578 DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src4, coeff_vec, res0, res1); 579 res0 = __lasx_xvssrarni_bu_h(res1, res0, 6); 580 __lasx_xvstelm_w(res0, dst, 0, 0); 581 __lasx_xvstelm_w(res0, dst + stride, 0, 1); 582 __lasx_xvstelm_w(res0, dst + stride_2, 0, 4); 583 __lasx_xvstelm_w(res0, dst + stride_3, 0, 5); 584 dst += stride_4; 585 __lasx_xvstelm_w(res0, dst, 0, 2); 586 __lasx_xvstelm_w(res0, dst + stride, 0, 3); 587 __lasx_xvstelm_w(res0, dst + stride_2, 0, 6); 588 __lasx_xvstelm_w(res0, dst + stride_3, 0, 7); 589} 590 591static void avc_chroma_hz_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 592 uint32_t coeff0, uint32_t coeff1, 593 int32_t height) 594{ 595 if (8 == height) { 596 avc_chroma_hz_4x8_lasx(src, dst, stride, coeff0, coeff1); 597 } else if (4 == height) { 598 avc_chroma_hz_4x4_lasx(src, dst, stride, coeff0, coeff1); 599 } else if (2 == height) { 600 avc_chroma_hz_4x2_lasx(src, dst, stride, coeff0, coeff1); 601 } 602} 603 604static void avc_chroma_hz_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 605 uint32_t coeff0, uint32_t coeff1, 606 int32_t height) 607{ 608 if (4 == height) { 609 avc_chroma_hz_8x4_lasx(src, dst, stride, coeff0, coeff1); 610 } else if (8 == height) { 611 avc_chroma_hz_8x8_lasx(src, dst, stride, coeff0, coeff1); 612 } else { 613 avc_chroma_hz_nonmult_lasx(src, dst, stride, coeff0, coeff1, height); 614 } 615} 616 617static void avc_chroma_vt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 618 uint32_t coeff0, uint32_t coeff1) 619{ 620 __m256i src0, src1, src2; 621 __m256i tmp0, tmp1; 622 __m256i res; 623 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 624 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 625 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 626 627 src0 = __lasx_xvld(src, 0); 628 DUP2_ARG2(__lasx_xvldx, src, stride, src, stride << 1, src1, src2); 629 DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, tmp0, tmp1); 630 tmp0 = __lasx_xvilvl_d(tmp1, tmp0); 631 res = __lasx_xvdp2_h_bu(tmp0, coeff_vec); 632 res = __lasx_xvslli_h(res, 3); 633 res = __lasx_xvssrarni_bu_h(res, res, 6); 634 __lasx_xvstelm_w(res, dst, 0, 0); 635 __lasx_xvstelm_w(res, dst + stride, 0, 1); 636} 637 638static void avc_chroma_vt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 639 uint32_t coeff0, uint32_t coeff1) 640{ 641 ptrdiff_t stride_2 = stride << 1; 642 ptrdiff_t stride_3 = stride_2 + stride; 643 ptrdiff_t stride_4 = stride_2 << 1; 644 __m256i src0, src1, src2, src3, src4; 645 __m256i tmp0, tmp1, tmp2, tmp3; 646 __m256i res; 647 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 648 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 649 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 650 651 src0 = __lasx_xvld(src, 0); 652 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 653 src, stride_4, src1, src2, src3, src4); 654 DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, 655 tmp0, tmp1, tmp2, tmp3); 656 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); 657 tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02); 658 res = __lasx_xvdp2_h_bu(tmp0, coeff_vec); 659 res = __lasx_xvslli_h(res, 3); 660 res = __lasx_xvssrarni_bu_h(res, res, 6); 661 __lasx_xvstelm_w(res, dst, 0, 0); 662 __lasx_xvstelm_w(res, dst + stride, 0, 1); 663 __lasx_xvstelm_w(res, dst + stride_2, 0, 4); 664 __lasx_xvstelm_w(res, dst + stride_3, 0, 5); 665} 666 667static void avc_chroma_vt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 668 uint32_t coeff0, uint32_t coeff1) 669{ 670 ptrdiff_t stride_2 = stride << 1; 671 ptrdiff_t stride_3 = stride_2 + stride; 672 ptrdiff_t stride_4 = stride_2 << 1; 673 __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 674 __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 675 __m256i res0, res1; 676 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 677 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 678 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 679 680 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 681 src0 = __lasx_xvld(src, 0); 682 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 683 src, stride_4, src1, src2, src3, src4); 684 src += stride_4; 685 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 686 src, stride_4, src5, src6, src7, src8); 687 DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, 688 tmp0, tmp1, tmp2, tmp3); 689 DUP4_ARG2(__lasx_xvilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, 690 tmp4, tmp5, tmp6, tmp7); 691 DUP4_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, 692 tmp0, tmp2, tmp4, tmp6); 693 tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02); 694 tmp4 = __lasx_xvpermi_q(tmp4, tmp6, 0x02); 695 DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, coeff_vec, tmp4, coeff_vec, res0, res1); 696 res0 = __lasx_xvssrarni_bu_h(res1, res0, 6); 697 __lasx_xvstelm_w(res0, dst, 0, 0); 698 __lasx_xvstelm_w(res0, dst + stride, 0, 1); 699 __lasx_xvstelm_w(res0, dst + stride_2, 0, 4); 700 __lasx_xvstelm_w(res0, dst + stride_3, 0, 5); 701 dst += stride_4; 702 __lasx_xvstelm_w(res0, dst, 0, 2); 703 __lasx_xvstelm_w(res0, dst + stride, 0, 3); 704 __lasx_xvstelm_w(res0, dst + stride_2, 0, 6); 705 __lasx_xvstelm_w(res0, dst + stride_3, 0, 7); 706} 707 708static void avc_chroma_vt_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 709 uint32_t coeff0, uint32_t coeff1, 710 int32_t height) 711{ 712 if (8 == height) { 713 avc_chroma_vt_4x8_lasx(src, dst, stride, coeff0, coeff1); 714 } else if (4 == height) { 715 avc_chroma_vt_4x4_lasx(src, dst, stride, coeff0, coeff1); 716 } else if (2 == height) { 717 avc_chroma_vt_4x2_lasx(src, dst, stride, coeff0, coeff1); 718 } 719} 720 721static void avc_chroma_vt_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 722 uint32_t coeff0, uint32_t coeff1, 723 int32_t height) 724{ 725 if (4 == height) { 726 avc_chroma_vt_8x4_lasx(src, dst, stride, coeff0, coeff1); 727 } else if (8 == height) { 728 avc_chroma_vt_8x8_lasx(src, dst, stride, coeff0, coeff1); 729 } 730} 731 732static void copy_width4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 733 int32_t height) 734{ 735 uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; 736 737 if (8 == height) { 738 ptrdiff_t stride_2, stride_3, stride_4; 739 740 __asm__ volatile ( 741 "slli.d %[stride_2], %[stride], 1 \n\t" 742 "add.d %[stride_3], %[stride_2], %[stride] \n\t" 743 "slli.d %[stride_4], %[stride_2], 1 \n\t" 744 "ld.wu %[tp0], %[src], 0 \n\t" 745 "ldx.wu %[tp1], %[src], %[stride] \n\t" 746 "ldx.wu %[tp2], %[src], %[stride_2] \n\t" 747 "ldx.wu %[tp3], %[src], %[stride_3] \n\t" 748 "add.d %[src], %[src], %[stride_4] \n\t" 749 "ld.wu %[tp4], %[src], 0 \n\t" 750 "ldx.wu %[tp5], %[src], %[stride] \n\t" 751 "ldx.wu %[tp6], %[src], %[stride_2] \n\t" 752 "ldx.wu %[tp7], %[src], %[stride_3] \n\t" 753 "st.w %[tp0], %[dst], 0 \n\t" 754 "stx.w %[tp1], %[dst], %[stride] \n\t" 755 "stx.w %[tp2], %[dst], %[stride_2] \n\t" 756 "stx.w %[tp3], %[dst], %[stride_3] \n\t" 757 "add.d %[dst], %[dst], %[stride_4] \n\t" 758 "st.w %[tp4], %[dst], 0 \n\t" 759 "stx.w %[tp5], %[dst], %[stride] \n\t" 760 "stx.w %[tp6], %[dst], %[stride_2] \n\t" 761 "stx.w %[tp7], %[dst], %[stride_3] \n\t" 762 : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3), [stride_4]"+&r"(stride_4), 763 [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1), 764 [tp2]"+&r"(tp2), [tp3]"+&r"(tp3), [tp4]"+&r"(tp4), [tp5]"+&r"(tp5), 765 [tp6]"+&r"(tp6), [tp7]"+&r"(tp7) 766 : [stride]"r"(stride) 767 : "memory" 768 ); 769 } else if (4 == height) { 770 ptrdiff_t stride_2, stride_3; 771 772 __asm__ volatile ( 773 "slli.d %[stride_2], %[stride], 1 \n\t" 774 "add.d %[stride_3], %[stride_2], %[stride] \n\t" 775 "ld.wu %[tp0], %[src], 0 \n\t" 776 "ldx.wu %[tp1], %[src], %[stride] \n\t" 777 "ldx.wu %[tp2], %[src], %[stride_2] \n\t" 778 "ldx.wu %[tp3], %[src], %[stride_3] \n\t" 779 "st.w %[tp0], %[dst], 0 \n\t" 780 "stx.w %[tp1], %[dst], %[stride] \n\t" 781 "stx.w %[tp2], %[dst], %[stride_2] \n\t" 782 "stx.w %[tp3], %[dst], %[stride_3] \n\t" 783 : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3), 784 [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1), 785 [tp2]"+&r"(tp2), [tp3]"+&r"(tp3) 786 : [stride]"r"(stride) 787 : "memory" 788 ); 789 } else if (2 == height) { 790 __asm__ volatile ( 791 "ld.wu %[tp0], %[src], 0 \n\t" 792 "ldx.wu %[tp1], %[src], %[stride] \n\t" 793 "st.w %[tp0], %[dst], 0 \n\t" 794 "stx.w %[tp1], %[dst], %[stride] \n\t" 795 : [tp0]"+&r"(tp0), [tp1]"+&r"(tp1) 796 : [src]"r"(src), [dst]"r"(dst), [stride]"r"(stride) 797 : "memory" 798 ); 799 } 800} 801 802static void copy_width8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 803 int32_t height) 804{ 805 if (8 == height) { 806 copy_width8x8_lasx(src, dst, stride); 807 } else if (4 == height) { 808 copy_width8x4_lasx(src, dst, stride); 809 } 810} 811 812void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, 813 int height, int x, int y) 814{ 815 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); 816 817 if(x && y) { 818 avc_chroma_hv_4w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height); 819 } else if (x) { 820 avc_chroma_hz_4w_lasx(src, dst, stride, x, (8 - x), height); 821 } else if (y) { 822 avc_chroma_vt_4w_lasx(src, dst, stride, y, (8 - y), height); 823 } else { 824 copy_width4_lasx(src, dst, stride, height); 825 } 826} 827 828void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, 829 int height, int x, int y) 830{ 831 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); 832 833 if (!(x || y)) { 834 copy_width8_lasx(src, dst, stride, height); 835 } else if (x && y) { 836 avc_chroma_hv_8w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height); 837 } else if (x) { 838 avc_chroma_hz_8w_lasx(src, dst, stride, x, (8 - x), height); 839 } else { 840 avc_chroma_vt_8w_lasx(src, dst, stride, y, (8 - y), height); 841 } 842} 843 844static av_always_inline void avc_chroma_hv_and_aver_dst_8x4_lasx(uint8_t *src, 845 uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0, 846 uint32_t coef_hor1, uint32_t coef_ver0, 847 uint32_t coef_ver1) 848{ 849 ptrdiff_t stride_2x = stride << 1; 850 ptrdiff_t stride_3x = stride_2x + stride; 851 ptrdiff_t stride_4x = stride << 2; 852 __m256i tp0, tp1, tp2, tp3; 853 __m256i src0, src1, src2, src3, src4, out; 854 __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1; 855 __m256i mask; 856 __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 857 __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 858 __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 859 __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 860 __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 861 862 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 863 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 864 src1, src2, src3, src4); 865 DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3); 866 src0 = __lasx_xvshuf_b(src0, src0, mask); 867 DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3); 868 DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1); 869 res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec); 870 res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0); 871 res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0); 872 res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20); 873 res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3); 874 res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1); 875 res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1); 876 out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6); 877 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 878 tp0, tp1, tp2, tp3); 879 DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 880 tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 881 out = __lasx_xvavgr_bu(out, tp0); 882 __lasx_xvstelm_d(out, dst, 0, 0); 883 __lasx_xvstelm_d(out, dst + stride, 0, 2); 884 __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 885 __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 886} 887 888static av_always_inline void avc_chroma_hv_and_aver_dst_8x8_lasx(uint8_t *src, 889 uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0, 890 uint32_t coef_hor1, uint32_t coef_ver0, 891 uint32_t coef_ver1) 892{ 893 ptrdiff_t stride_2x = stride << 1; 894 ptrdiff_t stride_3x = stride_2x + stride; 895 ptrdiff_t stride_4x = stride << 2; 896 __m256i tp0, tp1, tp2, tp3, dst0, dst1; 897 __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 898 __m256i out0, out1; 899 __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4; 900 __m256i res_vt0, res_vt1, res_vt2, res_vt3; 901 __m256i mask; 902 __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 903 __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 904 __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 905 __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 906 __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 907 908 DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 909 src += stride; 910 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 911 src1, src2, src3, src4); 912 src += stride_4x; 913 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 914 src5, src6, src7, src8); 915 DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20, 916 src8, src7, 0x20, src1, src3, src5, src7); 917 src0 = __lasx_xvshuf_b(src0, src0, mask); 918 DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7, 919 src7, mask, src1, src3, src5, src7); 920 DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3, 921 coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3); 922 res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec); 923 res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0); 924 res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0); 925 res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0); 926 res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0); 927 res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20); 928 res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3); 929 res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3); 930 res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3); 931 res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1); 932 res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1); 933 res_vt2 = __lasx_xvmadd_h(res_vt2, res_hz2, coeff_vt_vec1); 934 res_vt3 = __lasx_xvmadd_h(res_vt3, res_hz3, coeff_vt_vec1); 935 DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6, 936 out0, out1); 937 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 938 tp0, tp1, tp2, tp3); 939 DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 940 dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 941 dst += stride_4x; 942 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 943 tp0, tp1, tp2, tp3); 944 dst -= stride_4x; 945 DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 946 dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20); 947 out0 = __lasx_xvavgr_bu(out0, dst0); 948 out1 = __lasx_xvavgr_bu(out1, dst1); 949 __lasx_xvstelm_d(out0, dst, 0, 0); 950 __lasx_xvstelm_d(out0, dst + stride, 0, 2); 951 __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 952 __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 953 dst += stride_4x; 954 __lasx_xvstelm_d(out1, dst, 0, 0); 955 __lasx_xvstelm_d(out1, dst + stride, 0, 2); 956 __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 957 __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 958} 959 960static av_always_inline void avc_chroma_hz_and_aver_dst_8x4_lasx(uint8_t *src, 961 uint8_t *dst, ptrdiff_t stride, uint32_t coeff0, 962 uint32_t coeff1) 963{ 964 ptrdiff_t stride_2x = stride << 1; 965 ptrdiff_t stride_3x = stride_2x + stride; 966 __m256i tp0, tp1, tp2, tp3; 967 __m256i src0, src1, src2, src3, out; 968 __m256i res0, res1; 969 __m256i mask; 970 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 971 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 972 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 973 974 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 975 mask = __lasx_xvld(chroma_mask_arr, 0); 976 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 977 src0, src1, src2, src3); 978 DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2); 979 DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2); 980 DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); 981 out = __lasx_xvssrarni_bu_h(res1, res0, 6); 982 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 983 tp0, tp1, tp2, tp3); 984 DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 985 tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 986 out = __lasx_xvavgr_bu(out, tp0); 987 __lasx_xvstelm_d(out, dst, 0, 0); 988 __lasx_xvstelm_d(out, dst + stride, 0, 2); 989 __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 990 __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 991} 992 993static av_always_inline void avc_chroma_hz_and_aver_dst_8x8_lasx(uint8_t *src, 994 uint8_t *dst, ptrdiff_t stride, uint32_t coeff0, 995 uint32_t coeff1) 996{ 997 ptrdiff_t stride_2x = stride << 1; 998 ptrdiff_t stride_3x = stride_2x + stride; 999 ptrdiff_t stride_4x = stride << 2; 1000 __m256i tp0, tp1, tp2, tp3, dst0, dst1; 1001 __m256i src0, src1, src2, src3, src4, src5, src6, src7; 1002 __m256i out0, out1; 1003 __m256i res0, res1, res2, res3; 1004 __m256i mask; 1005 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 1006 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 1007 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 1008 1009 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 1010 mask = __lasx_xvld(chroma_mask_arr, 0); 1011 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 1012 src0, src1, src2, src3); 1013 src += stride_4x; 1014 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 1015 src4, src5, src6, src7); 1016 DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20, 1017 src7, src6, 0x20, src0, src2, src4, src6); 1018 DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4, 1019 mask, src6, src6, mask, src0, src2, src4, src6); 1020 DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6, 1021 coeff_vec, res0, res1, res2, res3); 1022 DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1); 1023 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 1024 tp0, tp1, tp2, tp3); 1025 DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 1026 dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 1027 dst += stride_4x; 1028 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 1029 tp0, tp1, tp2, tp3); 1030 dst -= stride_4x; 1031 DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 1032 dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20); 1033 out0 = __lasx_xvavgr_bu(out0, dst0); 1034 out1 = __lasx_xvavgr_bu(out1, dst1); 1035 __lasx_xvstelm_d(out0, dst, 0, 0); 1036 __lasx_xvstelm_d(out0, dst + stride, 0, 2); 1037 __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 1038 __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 1039 dst += stride_4x; 1040 __lasx_xvstelm_d(out1, dst, 0, 0); 1041 __lasx_xvstelm_d(out1, dst + stride, 0, 2); 1042 __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 1043 __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 1044} 1045 1046static av_always_inline void avc_chroma_vt_and_aver_dst_8x4_lasx(uint8_t *src, 1047 uint8_t *dst, ptrdiff_t stride, uint32_t coeff0, 1048 uint32_t coeff1) 1049{ 1050 ptrdiff_t stride_2x = stride << 1; 1051 ptrdiff_t stride_3x = stride_2x + stride; 1052 ptrdiff_t stride_4x = stride << 2; 1053 __m256i tp0, tp1, tp2, tp3; 1054 __m256i src0, src1, src2, src3, src4, out; 1055 __m256i res0, res1; 1056 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 1057 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 1058 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 1059 1060 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 1061 src0 = __lasx_xvld(src, 0); 1062 DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 1063 src1, src2, src3, src4); 1064 DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, 1065 src4, src3, 0x20, src0, src1, src2, src3); 1066 DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2); 1067 DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); 1068 out = __lasx_xvssrarni_bu_h(res1, res0, 6); 1069 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 1070 tp0, tp1, tp2, tp3); 1071 DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 1072 tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 1073 out = __lasx_xvavgr_bu(out, tp0); 1074 __lasx_xvstelm_d(out, dst, 0, 0); 1075 __lasx_xvstelm_d(out, dst + stride, 0, 2); 1076 __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 1077 __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 1078} 1079 1080static av_always_inline void avc_chroma_vt_and_aver_dst_8x8_lasx(uint8_t *src, 1081 uint8_t *dst, ptrdiff_t stride, uint32_t coeff0, 1082 uint32_t coeff1) 1083{ 1084 ptrdiff_t stride_2x = stride << 1; 1085 ptrdiff_t stride_3x = stride_2x + stride; 1086 ptrdiff_t stride_4x = stride << 2; 1087 __m256i tp0, tp1, tp2, tp3, dst0, dst1; 1088 __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 1089 __m256i out0, out1; 1090 __m256i res0, res1, res2, res3; 1091 __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 1092 __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 1093 __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 1094 1095 coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 1096 src0 = __lasx_xvld(src, 0); 1097 src += stride; 1098 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 1099 src1, src2, src3, src4); 1100 src += stride_4x; 1101 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 1102 src5, src6, src7, src8); 1103 DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, 1104 src4, src3, 0x20, src0, src1, src2, src3); 1105 DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20, 1106 src8, src7, 0x20, src4, src5, src6, src7); 1107 DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6, 1108 src0, src2, src4, src6); 1109 DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6, 1110 coeff_vec, res0, res1, res2, res3); 1111 DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1); 1112 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 1113 tp0, tp1, tp2, tp3); 1114 DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 1115 dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 1116 dst += stride_4x; 1117 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 1118 tp0, tp1, tp2, tp3); 1119 dst -= stride_4x; 1120 DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 1121 dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20); 1122 out0 = __lasx_xvavgr_bu(out0, dst0); 1123 out1 = __lasx_xvavgr_bu(out1, dst1); 1124 __lasx_xvstelm_d(out0, dst, 0, 0); 1125 __lasx_xvstelm_d(out0, dst + stride, 0, 2); 1126 __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 1127 __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 1128 dst += stride_4x; 1129 __lasx_xvstelm_d(out1, dst, 0, 0); 1130 __lasx_xvstelm_d(out1, dst + stride, 0, 2); 1131 __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 1132 __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 1133} 1134 1135static av_always_inline void avg_width8x8_lasx(uint8_t *src, uint8_t *dst, 1136 ptrdiff_t stride) 1137{ 1138 __m256i src0, src1, src2, src3; 1139 __m256i dst0, dst1, dst2, dst3; 1140 ptrdiff_t stride_2x = stride << 1; 1141 ptrdiff_t stride_3x = stride_2x + stride; 1142 ptrdiff_t stride_4x = stride << 2; 1143 1144 src0 = __lasx_xvldrepl_d(src, 0); 1145 src1 = __lasx_xvldrepl_d(src + stride, 0); 1146 src2 = __lasx_xvldrepl_d(src + stride_2x, 0); 1147 src3 = __lasx_xvldrepl_d(src + stride_3x, 0); 1148 dst0 = __lasx_xvldrepl_d(dst, 0); 1149 dst1 = __lasx_xvldrepl_d(dst + stride, 0); 1150 dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0); 1151 dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0); 1152 src0 = __lasx_xvpackev_d(src1,src0); 1153 src2 = __lasx_xvpackev_d(src3,src2); 1154 src0 = __lasx_xvpermi_q(src0, src2, 0x02); 1155 dst0 = __lasx_xvpackev_d(dst1,dst0); 1156 dst2 = __lasx_xvpackev_d(dst3,dst2); 1157 dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02); 1158 dst0 = __lasx_xvavgr_bu(src0, dst0); 1159 __lasx_xvstelm_d(dst0, dst, 0, 0); 1160 __lasx_xvstelm_d(dst0, dst + stride, 0, 1); 1161 __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2); 1162 __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3); 1163 1164 src += stride_4x; 1165 dst += stride_4x; 1166 src0 = __lasx_xvldrepl_d(src, 0); 1167 src1 = __lasx_xvldrepl_d(src + stride, 0); 1168 src2 = __lasx_xvldrepl_d(src + stride_2x, 0); 1169 src3 = __lasx_xvldrepl_d(src + stride_3x, 0); 1170 dst0 = __lasx_xvldrepl_d(dst, 0); 1171 dst1 = __lasx_xvldrepl_d(dst + stride, 0); 1172 dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0); 1173 dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0); 1174 src0 = __lasx_xvpackev_d(src1,src0); 1175 src2 = __lasx_xvpackev_d(src3,src2); 1176 src0 = __lasx_xvpermi_q(src0, src2, 0x02); 1177 dst0 = __lasx_xvpackev_d(dst1,dst0); 1178 dst2 = __lasx_xvpackev_d(dst3,dst2); 1179 dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02); 1180 dst0 = __lasx_xvavgr_bu(src0, dst0); 1181 __lasx_xvstelm_d(dst0, dst, 0, 0); 1182 __lasx_xvstelm_d(dst0, dst + stride, 0, 1); 1183 __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2); 1184 __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3); 1185} 1186 1187static av_always_inline void avg_width8x4_lasx(uint8_t *src, uint8_t *dst, 1188 ptrdiff_t stride) 1189{ 1190 __m256i src0, src1, src2, src3; 1191 __m256i dst0, dst1, dst2, dst3; 1192 ptrdiff_t stride_2x = stride << 1; 1193 ptrdiff_t stride_3x = stride_2x + stride; 1194 1195 src0 = __lasx_xvldrepl_d(src, 0); 1196 src1 = __lasx_xvldrepl_d(src + stride, 0); 1197 src2 = __lasx_xvldrepl_d(src + stride_2x, 0); 1198 src3 = __lasx_xvldrepl_d(src + stride_3x, 0); 1199 dst0 = __lasx_xvldrepl_d(dst, 0); 1200 dst1 = __lasx_xvldrepl_d(dst + stride, 0); 1201 dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0); 1202 dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0); 1203 src0 = __lasx_xvpackev_d(src1,src0); 1204 src2 = __lasx_xvpackev_d(src3,src2); 1205 src0 = __lasx_xvpermi_q(src0, src2, 0x02); 1206 dst0 = __lasx_xvpackev_d(dst1,dst0); 1207 dst2 = __lasx_xvpackev_d(dst3,dst2); 1208 dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02); 1209 dst0 = __lasx_xvavgr_bu(src0, dst0); 1210 __lasx_xvstelm_d(dst0, dst, 0, 0); 1211 __lasx_xvstelm_d(dst0, dst + stride, 0, 1); 1212 __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2); 1213 __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3); 1214} 1215 1216static void avc_chroma_hv_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst, 1217 ptrdiff_t stride, 1218 uint32_t coef_hor0, 1219 uint32_t coef_hor1, 1220 uint32_t coef_ver0, 1221 uint32_t coef_ver1, 1222 int32_t height) 1223{ 1224 if (4 == height) { 1225 avc_chroma_hv_and_aver_dst_8x4_lasx(src, dst, stride, coef_hor0, 1226 coef_hor1, coef_ver0, coef_ver1); 1227 } else if (8 == height) { 1228 avc_chroma_hv_and_aver_dst_8x8_lasx(src, dst, stride, coef_hor0, 1229 coef_hor1, coef_ver0, coef_ver1); 1230 } 1231} 1232 1233static void avc_chroma_hz_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst, 1234 ptrdiff_t stride, uint32_t coeff0, 1235 uint32_t coeff1, int32_t height) 1236{ 1237 if (4 == height) { 1238 avc_chroma_hz_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1); 1239 } else if (8 == height) { 1240 avc_chroma_hz_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1); 1241 } 1242} 1243 1244static void avc_chroma_vt_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst, 1245 ptrdiff_t stride, uint32_t coeff0, 1246 uint32_t coeff1, int32_t height) 1247{ 1248 if (4 == height) { 1249 avc_chroma_vt_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1); 1250 } else if (8 == height) { 1251 avc_chroma_vt_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1); 1252 } 1253} 1254 1255static void avg_width8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 1256 int32_t height) 1257{ 1258 if (8 == height) { 1259 avg_width8x8_lasx(src, dst, stride); 1260 } else if (4 == height) { 1261 avg_width8x4_lasx(src, dst, stride); 1262 } 1263} 1264 1265void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, 1266 int height, int x, int y) 1267{ 1268 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); 1269 1270 if (!(x || y)) { 1271 avg_width8_lasx(src, dst, stride, height); 1272 } else if (x && y) { 1273 avc_chroma_hv_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), y, 1274 (8 - y), height); 1275 } else if (x) { 1276 avc_chroma_hz_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), height); 1277 } else { 1278 avc_chroma_vt_and_aver_dst_8w_lasx(src, dst, stride, y, (8 - y), height); 1279 } 1280} 1281