/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vc1dsp_loongarch.h"
#include "libavutil/avassert.h"
#include "libavutil/loongarch/loongson_intrinsics.h"

/* VC-1 8x8 inverse transform, done in place on the 64 coefficients. */
void ff_vc1_inv_trans_8x8_lasx(int16_t block[64])
{
    int32_t con_4  = 4;
    int32_t con_64 = 64;
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4, t5, t6, t7, t8;
    __m256i const_1  = {0x000c000c000c000c, 0x000c000c000c000c,
                        0x000c000c000c000c, 0x000c000c000c000c};
    __m256i const_2  = {0xfff4000cfff4000c, 0xfff4000cfff4000c,
                        0xfff4000cfff4000c, 0xfff4000cfff4000c};
    __m256i const_3  = {0x0006001000060010, 0x0006001000060010,
                        0x0006001000060010, 0x0006001000060010};
    __m256i const_4  = {0xfff00006fff00006, 0xfff00006fff00006,
                        0xfff00006fff00006, 0xfff00006fff00006};
    __m256i const_5  = {0x000f0010000f0010, 0x000f0010000f0010,
                        0x000f0010000f0010, 0x000f0010000f0010};
    __m256i const_6  = {0x0004000900040009, 0x0004000900040009,
                        0x0004000900040009, 0x0004000900040009};
    __m256i const_7  = {0xfffc000ffffc000f, 0xfffc000ffffc000f,
                        0xfffc000ffffc000f, 0xfffc000ffffc000f};
    __m256i const_8  = {0xfff7fff0fff7fff0, 0xfff7fff0fff7fff0,
                        0xfff7fff0fff7fff0, 0xfff7fff0fff7fff0};
    __m256i const_9  = {0xfff00009fff00009, 0xfff00009fff00009,
                        0xfff00009fff00009, 0xfff00009fff00009};
    __m256i const_10 = {0x000f0004000f0004, 0x000f0004000f0004,
                        0x000f0004000f0004, 0x000f0004000f0004};
    __m256i const_11 = {0xfff70004fff70004, 0xfff70004fff70004,
                        0xfff70004fff70004, 0xfff70004fff70004};
    __m256i const_12 = {0xfff0000ffff0000f, 0xfff0000ffff0000f,
                        0xfff0000ffff0000f, 0xfff0000ffff0000f};

    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    /* first pass (rows) */
    DUP2_ARG2(__lasx_xvilvl_h, in2, in0, in3, in1, temp0, temp1);
    t2 = __lasx_xvreplgr2vr_w(con_4);
    DUP2_ARG3(__lasx_xvdp2add_w_h, t2, temp0, const_1, t2, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);

    t5 = __lasx_xvadd_w(t1, t3);
    t6 = __lasx_xvadd_w(t2, t4);
    t7 = __lasx_xvsub_w(t2, t4);
    t8 = __lasx_xvsub_w(t1, t3);

    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, in3, in2, temp0, temp1);
    temp2 = __lasx_xvdp2_w_h(const_5, temp0);
    t1 = __lasx_xvdp2add_w_h(temp2, temp1, const_6);
    temp2 = __lasx_xvdp2_w_h(const_7, temp0);
    t2 = __lasx_xvdp2add_w_h(temp2, temp1, const_8);
    temp2 = __lasx_xvdp2_w_h(const_9, temp0);
    t3 = __lasx_xvdp2add_w_h(temp2, temp1, const_10);
    temp2 = __lasx_xvdp2_w_h(const_11, temp0);
    t4 = __lasx_xvdp2add_w_h(temp2, temp1, const_12);
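    /* Pass-1 butterfly: t5..t8 hold the even-half sums, t1..t4 the
     * odd-half dot products; their sums and differences give the row
     * results, scaled down by >> 3 below. */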
    DUP4_ARG2(__lasx_xvadd_w, t1, t5, t6, t2, t7, t3, t8, t4,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsub_w, t8, t4, t7, t3, t6, t2, t5, t1,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 3, temp1, 3, temp2, 3, temp3, 3,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsrai_w, in0, 3, in1, 3, in2, 3, in3, 3,
              in0, in1, in2, in3);

    /* second pass (columns) */
    DUP4_ARG2(__lasx_xvpackev_h, temp1, temp0, temp3, temp2, in1, in0,
              in3, in2, temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvilvl_w, temp1, temp0, temp3, temp2, t1, t3);
    DUP2_ARG2(__lasx_xvilvh_w, temp1, temp0, temp3, temp2, t2, t4);
    DUP4_ARG3(__lasx_xvpermi_q, t3, t1, 0x20, t3, t1, 0x31, t4, t2, 0x20,
              t4, t2, 0x31, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in3, in2, temp0, temp1);
    t3 = __lasx_xvreplgr2vr_w(con_64);
    DUP2_ARG3(__lasx_xvdp2add_w_h, t3, temp0, const_1, t3, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);

    t5 = __lasx_xvadd_w(t1, t3);
    t6 = __lasx_xvadd_w(t2, t4);
    t7 = __lasx_xvsub_w(t2, t4);
    t8 = __lasx_xvsub_w(t1, t3);

    DUP2_ARG2(__lasx_xvilvh_h, in2, in0, in3, in1, temp0, temp1);
    temp2 = __lasx_xvdp2_w_h(const_5, temp0);
    t1 = __lasx_xvdp2add_w_h(temp2, temp1, const_6);
    temp2 = __lasx_xvdp2_w_h(const_7, temp0);
    t2 = __lasx_xvdp2add_w_h(temp2, temp1, const_8);
    temp2 = __lasx_xvdp2_w_h(const_9, temp0);
    t3 = __lasx_xvdp2add_w_h(temp2, temp1, const_10);
    temp2 = __lasx_xvdp2_w_h(const_11, temp0);
    t4 = __lasx_xvdp2add_w_h(temp2, temp1, const_12);

    DUP4_ARG2(__lasx_xvadd_w, t5, t1, t6, t2, t7, t3, t8, t4,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsub_w, t8, t4, t7, t3, t6, t2, t5, t1,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvaddi_wu, in0, 1, in1, 1, in2, 1, in3, 1,
              in0, in1, in2, in3);
    DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 7, temp3, temp2, 7,
              in1, in0, 7, in3, in2, 7, t1, t2, t3, t4);
    DUP4_ARG2(__lasx_xvpermi_d, t1, 0xD8, t2, 0xD8, t3, 0xD8, t4, 0xD8,
              in0, in1, in2, in3);
    __lasx_xvst(in0, block, 0);
    __lasx_xvst(in1, block, 32);
    __lasx_xvst(in2, block, 64);
    __lasx_xvst(in3, block, 96);
}

/* 8x8 DC-only inverse transform; the DC term is added to dest. */
void ff_vc1_inv_trans_8x8_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3, in4, in5, in6, in7;
    __m256i const_dc, temp0, temp1, temp2, temp3;
    __m256i reg0, reg1, reg2, reg3;

    dc = (3 * dc + 1) >> 1;
    dc = (3 * dc + 16) >> 5;

    const_dc = __lasx_xvreplgr2vr_h(dc);
    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvldrepl_d, dst, 0, dst + stride, 0, dst + stride2,
              0, dst + stride3, 0, in4, in5, in6, in7);

    DUP4_ARG2(__lasx_xvilvl_d, in1, in0, in3, in2, in5, in4, in7, in6,
              temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_vext2xv_hu_bu, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);

    DUP4_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, temp2,
              const_dc, temp3, const_dc, reg0, reg1, reg2, reg3);
    DUP2_ARG3(__lasx_xvssrarni_bu_h, reg1, reg0, 0, reg3, reg2, 0,
              temp0, temp1);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
    __lasx_xvstelm_d(temp1, dst, 0, 0);
    __lasx_xvstelm_d(temp1, dst + stride, 0, 2);
    __lasx_xvstelm_d(temp1, dst + stride2, 0, 1);
    __lasx_xvstelm_d(temp1, dst + stride3, 0, 3);
}

/* VC-1 8x4 inverse transform; the result is added to dest and clipped. */
void ff_vc1_inv_trans_8x4_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    __m256i shift = {0x0000000400000000, 0x0000000500000001,
                     0x0000000600000002, 0x0000000700000003};
    __m256i const_64 = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};
    __m256i const_1 = {0x00060010000C000C, 0x00060010000C000C,
                       0x00060010000C000C, 0x00060010000C000C};
    __m256i const_2 = {0xFFF00006FFF4000C, 0xFFF00006FFF4000C,
                       0xFFF00006FFF4000C, 0xFFF00006FFF4000C};
    __m256i const_3 = {0x0004000F00090010, 0x0004000F00090010,
                       0x0004000F00090010, 0x0004000F00090010};
    __m256i const_4 = {0xFFF7FFFCFFF0000F, 0xFFF7FFFCFFF0000F,
                       0xFFF7FFFCFFF0000F, 0xFFF7FFFCFFF0000F};
    __m256i const_5 = {0x000FFFF000040009, 0x000FFFF000040009,
                       0x000FFFF000040009, 0x000FFFF000040009};
    __m256i const_6 = {0xFFF0FFF7000F0004, 0xFFF0FFF7000F0004,
                       0xFFF0FFF7000F0004, 0xFFF0FFF7000F0004};
    __m256i const_7 = {0x0000000000000004, 0x0000000000000004,
                       0x0000000000000004, 0x0000000000000004};
    __m256i const_8 = {0x0011001100110011, 0x0011001100110011,
                       0x0011001100110011, 0x0011001100110011};
    __m256i const_9 = {0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011,
                       0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011};
    __m256i const_10 = {0x000A0016000A0016, 0x000A0016000A0016,
                        0x000A0016000A0016, 0x000A0016000A0016};
    __m256i const_11 = {0x0016FFF60016FFF6, 0x0016FFF60016FFF6,
                        0x0016FFF60016FFF6, 0x0016FFF60016FFF6};
    __m256i in0, in1;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4;

    DUP2_ARG2(__lasx_xvld, block, 0, block, 32, in0, in1);
    /* first pass (8-point transform on the rows) */
    temp0 = __lasx_xvpermi_d(in0, 0xB1);
    temp1 = __lasx_xvpermi_d(in1, 0xB1);
    DUP2_ARG2(__lasx_xvilvl_h, temp0, in0, temp1, in1, temp0, temp1);
    temp2 = __lasx_xvpickev_w(temp1, temp0);
    temp3 = __lasx_xvpickod_w(temp1, temp0);

    DUP2_ARG2(__lasx_xvdp2_w_h, temp2, const_1, temp2, const_2, temp0, temp1);
    t1 = __lasx_xvadd_w(temp0, const_7);
    t2 = __lasx_xvadd_w(temp1, const_7);
    temp0 = __lasx_xvpickev_w(t2, t1);
    temp1 = __lasx_xvpickod_w(t2, t1);
    t3 = __lasx_xvadd_w(temp0, temp1);
    t4 = __lasx_xvsub_w(temp0, temp1);
    t4 = __lasx_xvpermi_d(t4, 0xB1);

    DUP4_ARG2(__lasx_xvdp4_d_h, temp3, const_3, temp3, const_4, temp3,
              const_5, temp3, const_6, t1, t2, temp0, temp1);
    temp2 = __lasx_xvpickev_w(t2, t1);
    temp3 = __lasx_xvpickev_w(temp1, temp0);

    t1 = __lasx_xvadd_w(temp2, t3);
    t2 = __lasx_xvadd_w(temp3, t4);
    temp0 = __lasx_xvsub_w(t4, temp3);
    temp1 = __lasx_xvsub_w(t3, temp2);
    /* second pass (4-point transform on the columns) */
    DUP2_ARG3(__lasx_xvsrani_h_w, t2, t1, 3, temp1, temp0, 3, temp2, temp3);
    temp3 = __lasx_xvshuf4i_h(temp3, 0x4E);
    temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);
    temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);
    DUP2_ARG3(__lasx_xvdp2add_w_h, const_64, temp0, const_8, const_64, temp0,
              const_9, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_10, temp1, const_11, t3, t4);
    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvsub_w(t2, t4);
    temp2 = __lasx_xvadd_w(t2, t4);
    temp3 = __lasx_xvsub_w(t1, t3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 7, temp1, 7, temp2, 7, temp3, 7,
              t1, t2, t3, t4);
    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2, 0,
              dest + stride3, 0, temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_vext2xv_wu_bu, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvadd_w, temp0, t1, temp1, t2, temp2, t3, temp3, t4,
              t1, t2, t3, t4);
    DUP4_ARG1(__lasx_xvclip255_w, t1, t2, t3, t4, t1, t2, t3, t4);
    DUP2_ARG2(__lasx_xvpickev_h, t2, t1, t4, t3, temp0, temp1);
    temp2 = __lasx_xvpickev_b(temp1, temp0);
    temp0 = __lasx_xvperm_w(temp2, shift);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
}

void ff_vc1_inv_trans_8x4_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    __m256i in0, in1, in2, in3;
    __m256i const_dc, temp0, temp1, reg0, reg1;

    dc = (3 * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_d, in1, in0, in3, in2, temp0, temp1);
    DUP2_ARG1(__lasx_vext2xv_hu_bu, temp0, temp1, temp0, temp1);
    DUP2_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, reg0, reg1);
    temp0 = __lasx_xvssrarni_bu_h(reg1, reg0, 0);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
}

void ff_vc1_inv_trans_4x8_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3, in4, in5, in6, in7;
    __m256i const_dc, temp0, temp1, temp2, temp3, reg0, reg1;

    dc = (17 * dc + 4) >> 3;
    dc = (12 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvldrepl_w, dst, 0, dst + stride, 0, dst + stride2,
              0, dst + stride3, 0, in4, in5, in6, in7);

    DUP4_ARG2(__lasx_xvilvl_w, in1, in0, in3, in2, in5, in4, in7, in6,
              temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvilvl_d, temp1, temp0, temp3, temp2, reg0, reg1);
    DUP2_ARG1(__lasx_vext2xv_hu_bu, reg0, reg1, temp0, temp1);
    DUP2_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, reg0, reg1);
    temp0 = __lasx_xvssrarni_bu_h(reg1, reg0, 0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dest + stride, 0, 1);
    __lasx_xvstelm_w(temp0, dest + stride2, 0, 4);
    __lasx_xvstelm_w(temp0, dest + stride3, 0, 5);
    __lasx_xvstelm_w(temp0, dst, 0, 2);
    __lasx_xvstelm_w(temp0, dst + stride, 0, 3);
    __lasx_xvstelm_w(temp0, dst + stride2, 0, 6);
    __lasx_xvstelm_w(temp0, dst + stride3, 0, 7);
}

void ff_vc1_inv_trans_4x8_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4;
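    /* const_1..const_4: 4-point coefficients (pass 1); const_5/const_6:
     * pass-1/pass-2 rounding biases (4 and 64); const_7..const_12:
     * 8-point coefficients (pass 2), packed per 16-bit lane. */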
    __m256i const_1 = {0x0011001100110011, 0x0011001100110011,
                       0x0011001100110011, 0x0011001100110011};
    __m256i const_2 = {0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011,
                       0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011};
    __m256i const_3 = {0x000A0016000A0016, 0x000A0016000A0016,
                       0x000A0016000A0016, 0x000A0016000A0016};
    __m256i const_4 = {0x0016FFF60016FFF6, 0x0016FFF60016FFF6,
                       0x0016FFF60016FFF6, 0x0016FFF60016FFF6};
    __m256i const_5 = {0x0000000400000004, 0x0000000400000004,
                       0x0000000400000004, 0x0000000400000004};
    __m256i const_6 = {0x0000004000000040, 0x0000004000000040,
                       0x0000004000000040, 0x0000004000000040};
    __m256i const_7 = {0x000C000C000C000C, 0x000C000C000C000C,
                       0xFFF4000CFFF4000C, 0xFFF4000CFFF4000C};
    __m256i const_8 = {0x0006001000060010, 0x0006001000060010,
                       0xFFF00006FFF00006, 0xFFF00006FFF00006};
    __m256i const_9 = {0x0009001000090010, 0x0009001000090010,
                       0x0004000F0004000F, 0x0004000F0004000F};
    __m256i const_10 = {0xFFF0000FFFF0000F, 0xFFF0000FFFF0000F,
                        0xFFF7FFFCFFF7FFFC, 0xFFF7FFFCFFF7FFFC};
    __m256i const_11 = {0x0004000900040009, 0x0004000900040009,
                        0x000FFFF0000FFFF0, 0x000FFFF0000FFFF0};
    __m256i const_12 = {0x000F0004000F0004, 0x000F0004000F0004,
                        0xFFF0FFF7FFF0FFF7, 0xFFF0FFF7FFF0FFF7};
    __m256i shift = {0x0000000400000000, 0x0000000600000002,
                     0x0000000500000001, 0x0000000700000003};

    /* first pass (4-point transform on the rows) */
    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
              in0, in1, in2, in3);
    in0 = __lasx_xvilvl_d(in1, in0);
    in1 = __lasx_xvilvl_d(in3, in2);
    temp0 = __lasx_xvpickev_h(in1, in0);
    temp1 = __lasx_xvpickod_h(in1, in0);
    temp0 = __lasx_xvperm_w(temp0, shift);
    temp1 = __lasx_xvperm_w(temp1, shift);

    DUP2_ARG3(__lasx_xvdp2add_w_h, const_5, temp0, const_1, const_5, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);

    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvsub_w(t2, t4);
    temp2 = __lasx_xvadd_w(t2, t4);
    temp3 = __lasx_xvsub_w(t1, t3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 3, temp1, 3, temp2, 3, temp3, 3,
              temp0, temp1, temp2, temp3);

    /* second pass (8-point transform on the columns) */
    t1 = __lasx_xvpickev_w(temp1, temp0);
    t2 = __lasx_xvpickev_w(temp3, temp2);
    t1 = __lasx_xvpickev_h(t2, t1);
    t3 = __lasx_xvpickod_w(temp1, temp0);
    t4 = __lasx_xvpickod_w(temp3, temp2);
    temp1 = __lasx_xvpickev_h(t4, t3);
    temp2 = __lasx_xvpermi_q(t1, t1, 0x00);
    temp3 = __lasx_xvpermi_q(t1, t1, 0x11);
    t1 = __lasx_xvdp2add_w_h(const_6, temp2, const_7);
    t2 = __lasx_xvdp2_w_h(temp3, const_8);
    t3 = __lasx_xvadd_w(t1, t2);
    t4 = __lasx_xvsub_w(t1, t2);
    t4 = __lasx_xvpermi_d(t4, 0x4E);

    DUP4_ARG2(__lasx_xvdp2_w_h, temp1, const_9, temp1, const_10, temp1,
              const_11, temp1, const_12, t1, t2, temp2, temp3);

    temp0 = __lasx_xvpermi_q(t2, t1, 0x20);
    temp1 = __lasx_xvpermi_q(t2, t1, 0x31);
    t1 = __lasx_xvadd_w(temp0, temp1);
    temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);
    temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);
    t2 = __lasx_xvadd_w(temp1, temp0);
    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvadd_w(t2, t4);
    temp2 = __lasx_xvsub_w(t4, t2);
    temp3 = __lasx_xvsub_w(t3, t1);
    temp2 = __lasx_xvaddi_wu(temp2, 1);
    temp3 = __lasx_xvaddi_wu(temp3, 1);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 7, temp1, 7, temp2, 7, temp3, 7,
              temp0, temp1, temp2, temp3);

    /* const_1..const_8 are reused below as scratch for the loaded pixels */
    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dest + stride, 0, dest + stride2, 0,
              dest + stride3, 0, const_1, const_2, const_3, const_4);
    DUP4_ARG2(__lasx_xvldrepl_w, dst, 0, dst + stride, 0, dst + stride2, 0,
              dst + stride3, 0, const_5, const_6, const_7, const_8);

    DUP4_ARG2(__lasx_xvilvl_w, const_2, const_1, const_4, const_3, const_5,
              const_6, const_7, const_8, const_1, const_2, const_3, const_4);
    DUP4_ARG1(__lasx_vext2xv_wu_bu, const_1, const_2, const_3, const_4,
              const_1, const_2, const_3, const_4);
    DUP4_ARG2(__lasx_xvadd_w, temp0, const_1, temp1, const_2, temp2, const_3,
              temp3, const_4, temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvpickev_h, temp1, temp0, temp3, temp2, temp0, temp1);
    temp0 = __lasx_xvpickev_b(temp1, temp0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dest + stride, 0, 4);
    __lasx_xvstelm_w(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_w(temp0, dest + stride3, 0, 5);
    __lasx_xvstelm_w(temp0, dst, 0, 6);
    __lasx_xvstelm_w(temp0, dst + stride, 0, 2);
    __lasx_xvstelm_w(temp0, dst + stride2, 0, 7);
    __lasx_xvstelm_w(temp0, dst + stride3, 0, 3);
}

void ff_vc1_inv_trans_4x4_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    uint8_t *dst1 = dest + stride;
    uint8_t *dst2 = dst1 + stride;
    uint8_t *dst3 = dst2 + stride;
    __m256i in0, in1, in2, in3, temp0, temp1, const_dc;
    __m256i zero = {0};

    dc = (17 * dc + 4) >> 3;
    dc = (17 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dst1, 0, dst2, 0, dst3, 0,
              in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_w, in1, in0, in3, in2, temp0, temp1);
    in0 = __lasx_xvpermi_q(temp1, temp0, 0x20);
    temp0 = __lasx_xvilvl_b(zero, in0);
    in0 = __lasx_xvadd_h(temp0, const_dc);
    temp0 = __lasx_xvssrarni_bu_h(in0, in0, 0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dst1, 0, 1);
    __lasx_xvstelm_w(temp0, dst2, 0, 4);
    __lasx_xvstelm_w(temp0, dst3, 0, 5);
}

void ff_vc1_inv_trans_4x4_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    uint8_t *dst1 = dest + stride;
    uint8_t *dst2 = dst1 + stride;
    uint8_t *dst3 = dst2 + stride;
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2;
    __m256i const_1 = {0x0011001100110011, 0xFFEF0011FFEF0011,
                       0x0011001100110011, 0xFFEF0011FFEF0011};
    __m256i const_2 = {0x000A0016000A0016, 0x0016FFF60016FFF6,
                       0x000A0016000A0016, 0x0016FFF60016FFF6};
    __m256i const_64 = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};

    DUP2_ARG2(__lasx_xvld, block, 0, block, 32, in0, in1);
    /* first pass (rows) */
    temp0 = __lasx_xvilvl_d(in1, in0);
    temp1 = __lasx_xvpickev_h(temp0, temp0);
    temp2 = __lasx_xvpickod_h(temp0, temp0);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_1, temp2, const_2, t1, t2);
    t1 = __lasx_xvaddi_wu(t1, 4);
    in0 = __lasx_xvadd_w(t1, t2);
    in1 = __lasx_xvsub_w(t1, t2);
    DUP2_ARG2(__lasx_xvsrai_w, in0, 3, in1, 3, in0, in1);
    /* second pass (columns) */
    temp0 = __lasx_xvpickev_h(in1, in0);
    temp1 = __lasx_xvpermi_q(temp0, temp0, 0x00);
    temp2 = __lasx_xvpermi_q(temp0, temp0, 0x11);
    const_1 = __lasx_xvpermi_d(const_1, 0xD8);
    const_2 = __lasx_xvpermi_d(const_2, 0xD8);
    t1 = __lasx_xvdp2add_w_h(const_64, temp1, const_1);
    t2 = __lasx_xvdp2_w_h(temp2, const_2);
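    /* Final butterfly and >> 7; the +64 column-pass bias is already
     * folded into t1 via const_64 above. */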
    in0 = __lasx_xvadd_w(t1, t2);
    in1 = __lasx_xvsub_w(t1, t2);
    DUP2_ARG2(__lasx_xvsrai_w, in0, 7, in1, 7, in0, in1);
    temp0 = __lasx_xvshuf4i_w(in0, 0x9C);
    temp1 = __lasx_xvshuf4i_w(in1, 0x9C);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dst1, 0, dst2, 0, dst3, 0,
              in0, in1, in2, in3);
    temp2 = __lasx_xvilvl_w(in2, in0);
    temp2 = __lasx_vext2xv_wu_bu(temp2);
    temp3 = __lasx_xvilvl_w(in1, in3);
    temp3 = __lasx_vext2xv_wu_bu(temp3);
    temp0 = __lasx_xvadd_w(temp0, temp2);
    temp1 = __lasx_xvadd_w(temp1, temp3);
    DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);
    temp1 = __lasx_xvpickev_h(temp1, temp0);
    temp0 = __lasx_xvpickev_b(temp1, temp1);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dst1, 0, 5);
    __lasx_xvstelm_w(temp0, dst2, 0, 4);
    __lasx_xvstelm_w(temp0, dst3, 0, 1);
}

/* 8x8 quarter-pel motion compensation: filter vertically (vmode), then
 * horizontally (hmode) on the transposed result. */
static void put_vc1_mspel_mc_h_v_lasx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int hmode, int vmode,
                                      int rnd)
{
    __m256i in0, in1, in2, in3;
    __m256i t0, t1, t2, t3, t4, t5, t6, t7;
    __m256i temp0, temp1, const_para1_2, const_para0_3;
    __m256i const_r, const_sh;
    __m256i sh = {0x0000000400000000, 0x0000000500000001,
                  0x0000000600000002, 0x0000000700000003};
    static const uint8_t para_value[][4] = {{4, 3, 53, 18},
                                            {1, 1, 9, 9},
                                            {3, 4, 18, 53}};
    static const int shift_value[] = {0, 5, 1, 5};
    int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;
    int r = (1 << (shift - 1)) + rnd - 1;
    const uint8_t *para_v = para_value[vmode - 1];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride4 = stride << 2;
    ptrdiff_t stride3 = stride2 + stride;

    const_r = __lasx_xvreplgr2vr_h(r);
    const_sh = __lasx_xvreplgr2vr_h(shift);
    src -= 1 + stride;
    const_para0_3 = __lasx_xvldrepl_h(para_v, 0);
    const_para1_2 = __lasx_xvldrepl_h(para_v, 2);
    DUP4_ARG2(__lasx_xvld, src, 0, src + stride, 0, src + stride2, 0,
              src + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
    t0 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t0 = __lasx_xvdp2sub_h_bu(t0, temp1, const_para0_3);
    src += stride4;
    in0 = __lasx_xvld(src, 0);
    in0 = __lasx_xvpermi_d(in0, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in3, in2, in0, in1, temp0, temp1);
    t1 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t1 = __lasx_xvdp2sub_h_bu(t1, temp1, const_para0_3);
    src += stride;
    in1 = __lasx_xvld(src, 0);
    in1 = __lasx_xvpermi_d(in1, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in0, in3, in1, in2, temp0, temp1);
    t2 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t2 = __lasx_xvdp2sub_h_bu(t2, temp1, const_para0_3);
    src += stride;
    in2 = __lasx_xvld(src, 0);
    in2 = __lasx_xvpermi_d(in2, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in1, in0, in2, in3, temp0, temp1);
    t3 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t3 = __lasx_xvdp2sub_h_bu(t3, temp1, const_para0_3);
    src += stride;
    in3 = __lasx_xvld(src, 0);
    in3 = __lasx_xvpermi_d(in3, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
    t4 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t4 = __lasx_xvdp2sub_h_bu(t4, temp1, const_para0_3);
    src += stride;
    in0 = __lasx_xvld(src, 0);
    in0 = __lasx_xvpermi_d(in0, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in3, in2, in0, in1, temp0, temp1);
    t5 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t5 = __lasx_xvdp2sub_h_bu(t5, temp1, const_para0_3);
    src += stride;
    in1 = __lasx_xvld(src, 0);
    in1 = __lasx_xvpermi_d(in1, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in0, in3, in1, in2, temp0, temp1);
    t6 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t6 = __lasx_xvdp2sub_h_bu(t6, temp1, const_para0_3);
    src += stride;
    in2 = __lasx_xvld(src, 0);
    in2 = __lasx_xvpermi_d(in2, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in1, in0, in2, in3, temp0, temp1);
    t7 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t7 = __lasx_xvdp2sub_h_bu(t7, temp1, const_para0_3);
    DUP4_ARG2(__lasx_xvadd_h, t0, const_r, t1, const_r, t2, const_r, t3,
              const_r, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvadd_h, t4, const_r, t5, const_r, t6, const_r, t7,
              const_r, t4, t5, t6, t7);
    DUP4_ARG2(__lasx_xvsra_h, t0, const_sh, t1, const_sh, t2, const_sh,
              t3, const_sh, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvsra_h, t4, const_sh, t5, const_sh, t6, const_sh,
              t7, const_sh, t4, t5, t6, t7);
    LASX_TRANSPOSE8x8_H(t0, t1, t2, t3, t4, t5, t6, t7, t0,
                        t1, t2, t3, t4, t5, t6, t7);
    /* horizontal pass on the transposed rows */
    para_v = para_value[hmode - 1];
    const_para0_3 = __lasx_xvldrepl_h(para_v, 0);
    const_para1_2 = __lasx_xvldrepl_h(para_v, 2);
    const_para0_3 = __lasx_vext2xv_h_b(const_para0_3);
    const_para1_2 = __lasx_vext2xv_h_b(const_para1_2);
    r = 64 - rnd;
    const_r = __lasx_xvreplgr2vr_w(r);
    DUP4_ARG2(__lasx_xvpermi_d, t0, 0x72, t1, 0x72, t2, 0x72, t0, 0xD8,
              in0, in1, in2, t0);
    DUP4_ARG2(__lasx_xvpermi_d, t1, 0xD8, t2, 0xD8, t3, 0xD8, t4, 0xD8,
              t1, t2, t3, t4);
    DUP2_ARG2(__lasx_xvpermi_d, t5, 0xD8, t6, 0xD8, t5, t6);
    t7 = __lasx_xvpermi_d(t7, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_h, t2, t1, t3, t0, temp0, temp1);
    t0 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t0 = __lasx_xvdp2sub_w_h(t0, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t3, t2, t4, t1, temp0, temp1);
    t1 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t1 = __lasx_xvdp2sub_w_h(t1, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t4, t3, t5, t2, temp0, temp1);
    t2 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t2 = __lasx_xvdp2sub_w_h(t2, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t5, t4, t6, t3, temp0, temp1);
    t3 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t3 = __lasx_xvdp2sub_w_h(t3, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t6, t5, t7, t4, temp0, temp1);
    t4 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t4 = __lasx_xvdp2sub_w_h(t4, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t7, t6, in0, t5, temp0, temp1);
    t5 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t5 = __lasx_xvdp2sub_w_h(t5, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, in0, t7, in1, t6, temp0, temp1);
    t6 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t6 = __lasx_xvdp2sub_w_h(t6, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in2, t7, temp0, temp1);
    t7 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t7 = __lasx_xvdp2sub_w_h(t7, temp1, const_para0_3);
    DUP4_ARG2(__lasx_xvadd_w, t0, const_r, t1, const_r, t2, const_r,
              t3, const_r, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvadd_w, t4, const_r, t5, const_r, t6, const_r,
              t7, const_r, t4, t5, t6, t7);
    DUP4_ARG2(__lasx_xvsrai_w, t0, 7, t1, 7, t2, 7, t3, 7, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvsrai_w, t4, 7, t5, 7, t6, 7, t7, 7, t4, t5, t6, t7);
    LASX_TRANSPOSE8x8_W(t0, t1, t2, t3, t4, t5, t6, t7,
                        t0, t1, t2, t3, t4, t5, t6, t7);
    DUP4_ARG1(__lasx_xvclip255_w, t0, t1, t2, t3, t0, t1, t2, t3);
    DUP4_ARG1(__lasx_xvclip255_w, t4, t5, t6, t7, t4, t5, t6, t7);
    DUP4_ARG2(__lasx_xvpickev_h, t1, t0, t3, t2, t5, t4, t7, t6,
              t0, t1, t2, t3);
    DUP2_ARG2(__lasx_xvpickev_b, t1, t0, t3, t2, t0, t1);
    t0 = __lasx_xvperm_w(t0, sh);
    t1 = __lasx_xvperm_w(t1, sh);
    __lasx_xvstelm_d(t0, dst, 0, 0);
    __lasx_xvstelm_d(t0, dst + stride, 0, 1);
    __lasx_xvstelm_d(t0, dst + stride2, 0, 2);
    __lasx_xvstelm_d(t0, dst + stride3, 0, 3);
    dst += stride4;
    __lasx_xvstelm_d(t1, dst, 0, 0);
    __lasx_xvstelm_d(t1, dst + stride, 0, 1);
    __lasx_xvstelm_d(t1, dst + stride2, 0, 2);
    __lasx_xvstelm_d(t1, dst + stride3, 0, 3);
}

#define PUT_VC1_MSPEL_MC_LASX(hmode, vmode)                                   \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _lasx(uint8_t *dst,             \
                                                    const uint8_t *src,       \
                                                    ptrdiff_t stride,         \
                                                    int rnd)                  \
{                                                                             \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
}                                                                             \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_lasx(uint8_t *dst,          \
                                                       const uint8_t *src,    \
                                                       ptrdiff_t stride,      \
                                                       int rnd)               \
{                                                                             \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
    put_vc1_mspel_mc_h_v_lasx(dst + 8, src + 8, stride, hmode, vmode, rnd);   \
    dst += 8 * stride, src += 8 * stride;                                     \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
    put_vc1_mspel_mc_h_v_lasx(dst + 8, src + 8, stride, hmode, vmode, rnd);   \
}

PUT_VC1_MSPEL_MC_LASX(1, 1);
PUT_VC1_MSPEL_MC_LASX(1, 2);
PUT_VC1_MSPEL_MC_LASX(1, 3);

PUT_VC1_MSPEL_MC_LASX(2, 1);
PUT_VC1_MSPEL_MC_LASX(2, 2);
PUT_VC1_MSPEL_MC_LASX(2, 3);

PUT_VC1_MSPEL_MC_LASX(3, 1);
PUT_VC1_MSPEL_MC_LASX(3, 2);
PUT_VC1_MSPEL_MC_LASX(3, 3);

void ff_put_no_rnd_vc1_chroma_mc8_lasx(uint8_t *dst /* align 8 */,
                                       uint8_t *src /* align 1 */,
                                       ptrdiff_t stride, int h, int x, int y)
{
    const int intA = (8 - x) * (8 - y);
    const int intB = (x) * (8 - y);
    const int intC = (8 - x) * (y);
    const int intD = (x) * (y);
    __m256i src00, src01, src10, src11;
    __m256i A, B, C, D;
    int i;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    A = __lasx_xvreplgr2vr_h(intA);
    B = __lasx_xvreplgr2vr_h(intB);
    C = __lasx_xvreplgr2vr_h(intC);
    D = __lasx_xvreplgr2vr_h(intD);
    for (i = 0; i < h; i++) {
        DUP2_ARG2(__lasx_xvld, src, 0, src, 1, src00, src01);
        src += stride;
        DUP2_ARG2(__lasx_xvld, src, 0, src, 1, src10, src11);

        DUP4_ARG1(__lasx_vext2xv_hu_bu, src00, src01, src10, src11,
                  src00, src01, src10, src11);
        DUP4_ARG2(__lasx_xvmul_h, src00, A, src01, B, src10, C, src11, D,
                  src00, src01, src10, src11);
        src00 = __lasx_xvadd_h(src00, src01);
        src10 = __lasx_xvadd_h(src10, src11);
        src00 = __lasx_xvadd_h(src00, src10);
        src00 = __lasx_xvaddi_hu(src00, 28);
        src00 = __lasx_xvsrli_h(src00, 6);
        src00 = __lasx_xvpickev_b(src00, src00);
        __lasx_xvstelm_d(src00, dst, 0, 0);
        dst += stride;
    }
}

/* 16-wide vertical-only quarter-pel MC (hmode == 0). */
static void put_vc1_mspel_mc_v_lasx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int vmode, int rnd)
{
    __m256i in0, in1, in2, in3, temp0, temp1, t0;
    __m256i const_para0_3, const_para1_2, const_r, const_sh;
    static const uint16_t para_value[][2] = {{0x0304, 0x1235},
                                             {0x0101, 0x0909},
                                             {0x0403, 0x3512}};
    const uint16_t *para_v = para_value[vmode - 1];
    static const int shift_value[] = {0, 6, 4, 6};
    int add_value[3];
    ptrdiff_t stride_2x = stride << 1;
    int i = 0;

    add_value[2] = add_value[0] = 31 + rnd, add_value[1] = 7 + rnd;
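    /* Modes 1 and 3 select the bicubic taps (round 31 + rnd, shift 6);
     * mode 2 is the half-pel {-1, 9, 9, -1} filter (round 7 + rnd,
     * shift 4). */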
    const_r = __lasx_xvreplgr2vr_h(add_value[vmode - 1]);
    const_sh = __lasx_xvreplgr2vr_h(shift_value[vmode]);
    const_para0_3 = __lasx_xvreplgr2vr_h(*para_v);
    const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1));

    DUP2_ARG2(__lasx_xvld, src - stride, 0, src, 0, in0, in1);
    in2 = __lasx_xvld(src + stride, 0);
    in0 = __lasx_xvpermi_d(in0, 0xD8);
    in1 = __lasx_xvpermi_d(in1, 0xD8);
    in2 = __lasx_xvpermi_d(in2, 0xD8);
    for (; i < 16; i++) {
        in3 = __lasx_xvld(src + stride_2x, 0);
        in3 = __lasx_xvpermi_d(in3, 0xD8);
        DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
        t0 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
        t0 = __lasx_xvdp2sub_h_bu(t0, temp1, const_para0_3);
        t0 = __lasx_xvadd_h(t0, const_r);
        t0 = __lasx_xvsra_h(t0, const_sh);
        t0 = __lasx_xvclip255_h(t0);
        t0 = __lasx_xvpickev_b(t0, t0);
        __lasx_xvstelm_d(t0, dst, 0, 0);
        __lasx_xvstelm_d(t0, dst, 8, 2);
        dst += stride;
        src += stride;
        in0 = in1;
        in1 = in2;
        in2 = in3;
    }
}

#define PUT_VC1_MSPEL_MC_V_LASX(vmode)                                    \
void ff_put_vc1_mspel_mc0 ## vmode ## _16_lasx(uint8_t *dst,              \
                                               const uint8_t *src,        \
                                               ptrdiff_t stride, int rnd) \
{                                                                         \
    put_vc1_mspel_mc_v_lasx(dst, src, stride, vmode, rnd);                \
}

PUT_VC1_MSPEL_MC_V_LASX(1);
PUT_VC1_MSPEL_MC_V_LASX(2);
PUT_VC1_MSPEL_MC_V_LASX(3);

/* Filter one transposed row: dp2/dp2sub apply the 4-tap filter, then
 * round, shift, clip to [0, 255] and pack back to bytes. */
#define ROW_LASX(in0, in1, in2, in3, out0)                                \
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, tmp0_m, tmp1_m);       \
    out0 = __lasx_xvdp2_h_bu(tmp0_m, const_para1_2);                      \
    out0 = __lasx_xvdp2sub_h_bu(out0, tmp1_m, const_para0_3);             \
    out0 = __lasx_xvadd_h(out0, const_r);                                 \
    out0 = __lasx_xvsra_h(out0, const_sh);                                \
    out0 = __lasx_xvclip255_h(out0);                                      \
    out0 = __lasx_xvpickev_b(out0, out0);                                 \
    out0 = __lasx_xvpermi_d(out0, 0xd8);

/* 16x16 horizontal-only quarter-pel MC (vmode == 0): transpose, filter
 * along the transposed rows, then transpose back. */
static void put_vc1_mspel_mc_h_lasx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int hmode, int rnd)
{
    __m256i in0, in1, in2, in3, in4, in5, in6, in7,
            in8, in9, in10, in11, in12, in13, in14, in15;
    __m256i out0, out1, out2, out3, out4, out5, out6, out7, out8, out9,
            out10, out11, out12, out13, out14, out15, out16, out17, out18;
    __m256i const_para0_3, const_para1_2, const_r, const_sh;
    __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m;
    __m256i tmp4_m, tmp5_m, tmp6_m, tmp7_m;
    __m256i t0, t1, t2, t3, t4, t5, t6, t7;
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride4 = stride << 2;
    ptrdiff_t stride3 = stride2 + stride;
    static const uint16_t para_value[][2] = {{0x0304, 0x1235},
                                             {0x0101, 0x0909},
                                             {0x0403, 0x3512}};
    const uint16_t *para_v = para_value[hmode - 1];
    static const int shift_value[] = {0, 6, 4, 6};
    int add_value[3];
    uint8_t *_src = (uint8_t *)src - 1;

    add_value[2] = add_value[0] = 32 - rnd, add_value[1] = 8 - rnd;

    const_r = __lasx_xvreplgr2vr_h(add_value[hmode - 1]);
    const_sh = __lasx_xvreplgr2vr_h(shift_value[hmode]);
    const_para0_3 = __lasx_xvreplgr2vr_h(*para_v);
    const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1));

    in0 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in1, in2);
    in3 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in4 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in5, in6);
    in7 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in8 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in9, in10);
    in11 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in12 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in13, in14);
    in15 = __lasx_xvldx(_src, stride3);
    /* 16x16 byte transpose, low halves */
    DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out0, out2, out4, out6);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out1, out3, out5, out7);

    /* 16x16 byte transpose, high halves */
    DUP4_ARG2(__lasx_xvilvh_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvh_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out8, out10, out12, out14);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out9, out11, out13, out15);
    /* out16..out18: the three extra columns needed as right-hand context
     * for the last output rows */
    DUP2_ARG3(__lasx_xvpermi_q, out0, out0, 0x31, out1, out1, 0x31,
              out16, out17);
    out18 = __lasx_xvpermi_q(out2, out2, 0x31);

    DUP4_ARG2(__lasx_xvpermi_d, out0, 0xD8, out1, 0xD8, out2, 0xD8, out3, 0xD8,
              out0, out1, out2, out3);
    DUP4_ARG2(__lasx_xvpermi_d, out4, 0xD8, out5, 0xD8, out6, 0xD8, out7, 0xD8,
              out4, out5, out6, out7);
    DUP4_ARG2(__lasx_xvpermi_d, out8, 0xD8, out9, 0xD8, out10, 0xD8, out11,
              0xD8, out8, out9, out10, out11);
    DUP4_ARG2(__lasx_xvpermi_d, out12, 0xD8, out13, 0xD8, out14, 0xD8, out15,
              0xD8, out12, out13, out14, out15);
    out16 = __lasx_xvpermi_d(out16, 0xD8);
    out17 = __lasx_xvpermi_d(out17, 0xD8);
    out18 = __lasx_xvpermi_d(out18, 0xD8);

    ROW_LASX(out0, out1, out2, out3, in0);
    ROW_LASX(out1, out2, out3, out4, in1);
    ROW_LASX(out2, out3, out4, out5, in2);
    ROW_LASX(out3, out4, out5, out6, in3);
    ROW_LASX(out4, out5, out6, out7, in4);
    ROW_LASX(out5, out6, out7, out8, in5);
    ROW_LASX(out6, out7, out8, out9, in6);
    ROW_LASX(out7, out8, out9, out10, in7);
    ROW_LASX(out8, out9, out10, out11, in8);
    ROW_LASX(out9, out10, out11, out12, in9);
    ROW_LASX(out10, out11, out12, out13, in10);
    ROW_LASX(out11, out12, out13, out14, in11);
    ROW_LASX(out12, out13, out14, out15, in12);
    ROW_LASX(out13, out14, out15, out16, in13);
    ROW_LASX(out14, out15, out16, out17, in14);
    ROW_LASX(out15, out16, out17, out18, in15);

    /* transpose back to row order */
    DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out0, out2, out4, out6);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out1, out3, out5, out7);

    DUP4_ARG2(__lasx_xvilvh_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvh_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out8, out10, out12, out14);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out9, out11, out13, out15);
    __lasx_xvstelm_d(out0, dst, 0, 0);
    __lasx_xvstelm_d(out0, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out1, dst, 0, 0);
    __lasx_xvstelm_d(out1, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out2, dst, 0, 0);
    __lasx_xvstelm_d(out2, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out3, dst, 0, 0);
    __lasx_xvstelm_d(out3, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out4, dst, 0, 0);
    __lasx_xvstelm_d(out4, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out5, dst, 0, 0);
    __lasx_xvstelm_d(out5, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out6, dst, 0, 0);
    __lasx_xvstelm_d(out6, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out7, dst, 0, 0);
    __lasx_xvstelm_d(out7, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out8, dst, 0, 0);
    __lasx_xvstelm_d(out8, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out9, dst, 0, 0);
    __lasx_xvstelm_d(out9, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out10, dst, 0, 0);
    __lasx_xvstelm_d(out10, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out11, dst, 0, 0);
    __lasx_xvstelm_d(out11, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out12, dst, 0, 0);
    __lasx_xvstelm_d(out12, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out13, dst, 0, 0);
    __lasx_xvstelm_d(out13, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out14, dst, 0, 0);
    __lasx_xvstelm_d(out14, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out15, dst, 0, 0);
    __lasx_xvstelm_d(out15, dst, 8, 1);
}

#define PUT_VC1_MSPEL_MC_H_LASX(hmode)                                    \
void ff_put_vc1_mspel_mc ## hmode ## 0_16_lasx(uint8_t *dst,              \
                                               const uint8_t *src,        \
                                               ptrdiff_t stride, int rnd) \
{                                                                         \
    put_vc1_mspel_mc_h_lasx(dst, src, stride, hmode, rnd);                \
}

PUT_VC1_MSPEL_MC_H_LASX(1);
PUT_VC1_MSPEL_MC_H_LASX(2);
PUT_VC1_MSPEL_MC_H_LASX(3);