/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "idctdsp_loongarch.h"
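
/*
 * Transpose the 4 x (16 x int16) layout held in in_0..in_3 into
 * out_0..out_3: __lasx_xvpermi_q first regroups the 128-bit lanes, then
 * the 16-bit and 32-bit interleaves (xvilvl/xvilvh) finish the
 * transposition.  In-place use (outputs aliasing inputs) is safe because
 * every read goes through the temp_* registers.
 */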
#define LASX_TRANSPOSE4x16(in_0, in_1, in_2, in_3, out_0, out_1, out_2, out_3) \
{                                                                              \
    __m256i temp_0, temp_1, temp_2, temp_3;                                    \
    __m256i temp_4, temp_5, temp_6, temp_7;                                    \
    DUP4_ARG3(__lasx_xvpermi_q, in_2, in_0, 0x20, in_2, in_0, 0x31, in_3, in_1,\
              0x20, in_3, in_1, 0x31, temp_0, temp_1, temp_2, temp_3);         \
    DUP2_ARG2(__lasx_xvilvl_h, temp_1, temp_0, temp_3, temp_2, temp_4, temp_6);\
    DUP2_ARG2(__lasx_xvilvh_h, temp_1, temp_0, temp_3, temp_2, temp_5, temp_7);\
    DUP2_ARG2(__lasx_xvilvl_w, temp_6, temp_4, temp_7, temp_5, out_0, out_2);  \
    DUP2_ARG2(__lasx_xvilvh_w, temp_6, temp_4, temp_7, temp_5, out_1, out_3);  \
}

/*
 * Row pass, following FUNC6(idctRowCondDC) from the scalar simple_idct
 * template: eight rows are processed at once.  select_vec marks rows with
 * no nonzero coefficients; __lasx_xvbitsel_v substitutes the DC-only value
 * (temp = dc << 3) for those rows, while the rest take the full butterfly
 * with a final arithmetic right shift by ROW_SHIFT (11).
 */
#define LASX_IDCTROWCONDDC                                                     \
    /* column-pass rounding bias: (1 << 19) rounded down to a multiple of W4 */\
    const_val  = 16383 * ((1 << 19) / 16383);                                  \
    const_val1 = __lasx_xvreplgr2vr_w(const_val);                              \
    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,          \
              in0, in1, in2, in3);                                             \
    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3);                \
    a0 = __lasx_xvpermi_d(in0, 0xD8);                                          \
    a0 = __lasx_vext2xv_w_h(a0);                                               \
    temp = __lasx_xvslli_w(a0, 3);                                             \
    a1 = __lasx_xvpermi_d(in0, 0x8D);                                          \
    a1 = __lasx_vext2xv_w_h(a1);                                               \
    a2 = __lasx_xvpermi_d(in1, 0xD8);                                          \
    a2 = __lasx_vext2xv_w_h(a2);                                               \
    a3 = __lasx_xvpermi_d(in1, 0x8D);                                          \
    a3 = __lasx_vext2xv_w_h(a3);                                               \
    b0 = __lasx_xvpermi_d(in2, 0xD8);                                          \
    b0 = __lasx_vext2xv_w_h(b0);                                               \
    b1 = __lasx_xvpermi_d(in2, 0x8D);                                          \
    b1 = __lasx_vext2xv_w_h(b1);                                               \
    b2 = __lasx_xvpermi_d(in3, 0xD8);                                          \
    b2 = __lasx_vext2xv_w_h(b2);                                               \
    b3 = __lasx_xvpermi_d(in3, 0x8D);                                          \
    b3 = __lasx_vext2xv_w_h(b3);                                               \
    select_vec = a0 | a1 | a2 | a3 | b0 | b1 | b2 | b3;                        \
    select_vec = __lasx_xvslti_wu(select_vec, 1);                              \
                                                                               \
    DUP4_ARG2(__lasx_xvrepl128vei_h, w1, 2, w1, 3, w1, 4, w1, 5,               \
              w2, w3, w4, w5);                                                 \
    DUP2_ARG2(__lasx_xvrepl128vei_h, w1, 6, w1, 7, w6, w7);                    \
    w1 = __lasx_xvrepl128vei_h(w1, 1);                                         \
                                                                               \
    /* part of FUNC6(idctRowCondDC) */                                         \
    temp0 = __lasx_xvmaddwl_w_h(const_val0, in0, w4);                          \
    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2);             \
    a0 = __lasx_xvadd_w(temp0, temp1);                                         \
    a1 = __lasx_xvadd_w(temp0, temp2);                                         \
    a2 = __lasx_xvsub_w(temp0, temp2);                                         \
    a3 = __lasx_xvsub_w(temp0, temp1);                                         \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1);                \
    b0 = __lasx_xvdp2_w_h(temp0, temp1);                                       \
    temp1 = __lasx_xvneg_h(w7);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b1 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w5);                                        \
    b2 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w5);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w7);                                        \
    b3 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
                                                                               \
    /* if (AV_RN64A(row + 4)) */                                               \
    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1);                \
    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1);                                \
    temp1 = __lasx_xvilvl_h(w2, w4);                                           \
    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1);                                \
    temp1 = __lasx_xvneg_h(w4);                                                \
    temp2 = __lasx_xvilvl_h(w2, temp1);                                        \
    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w6);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w4);                                        \
    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2);                                \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1);                \
    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1);                                \
    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2);                  \
    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1);                                \
    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2);                                \
                                                                               \
    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
              temp0, temp1, temp2, temp3);                                     \
    DUP4_ARG2(__lasx_xvsub_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
              a0, a1, a2, a3);                                                 \
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 11, temp1, 11, temp2, 11, temp3, 11,     \
              temp0, temp1, temp2, temp3);                                     \
    DUP4_ARG2(__lasx_xvsrai_w, a0, 11, a1, 11, a2, 11, a3, 11, a0, a1, a2, a3);\
    DUP4_ARG3(__lasx_xvbitsel_v, temp0, temp, select_vec, temp1, temp,         \
              select_vec, temp2, temp, select_vec, temp3, temp, select_vec,    \
              in0, in1, in2, in3);                                             \
    DUP4_ARG3(__lasx_xvbitsel_v, a0, temp, select_vec, a1, temp,               \
              select_vec, a2, temp, select_vec, a3, temp, select_vec,          \
              a0, a1, a2, a3);                                                 \
    DUP4_ARG2(__lasx_xvpickev_h, in1, in0, in3, in2, a2, a3, a0, a1,           \
              in0, in1, in2, in3);                                             \
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,    \
              in0, in1, in2, in3);
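
/*
 * Column pass, following FUNC6(idctSparseCol) from the scalar template:
 * transpose back, apply the same even/odd butterflies, then narrow the
 * 32-bit results to int16 with an arithmetic right shift by COL_SHIFT (20)
 * via __lasx_xvsrani_h_w.
 */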
#define LASX_IDCTCOLS                                                          \
    /* part of FUNC6(idctSparseCol) */                                         \
    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3);                \
    temp0 = __lasx_xvmaddwl_w_h(const_val1, in0, w4);                          \
    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2);             \
    a0 = __lasx_xvadd_w(temp0, temp1);                                         \
    a1 = __lasx_xvadd_w(temp0, temp2);                                         \
    a2 = __lasx_xvsub_w(temp0, temp2);                                         \
    a3 = __lasx_xvsub_w(temp0, temp1);                                         \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1);                \
    b0 = __lasx_xvdp2_w_h(temp0, temp1);                                       \
    temp1 = __lasx_xvneg_h(w7);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b1 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w5);                                        \
    b2 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w5);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w7);                                        \
    b3 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
                                                                               \
    /* if (AV_RN64A(row + 4)) */                                               \
    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1);                \
    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1);                                \
    temp1 = __lasx_xvilvl_h(w2, w4);                                           \
    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1);                                \
    temp1 = __lasx_xvneg_h(w4);                                                \
    temp2 = __lasx_xvilvl_h(w2, temp1);                                        \
    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w6);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w4);                                        \
    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2);                                \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1);                \
    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1);                                \
    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2);                  \
    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1);                                \
    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2);                                \
                                                                               \
    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
              temp0, temp1, temp2, temp3);                                     \
    DUP4_ARG2(__lasx_xvsub_w, a3, b3, a2, b2, a1, b1, a0, b0,                  \
              a3, a2, a1, a0);                                                 \
    DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 20, temp3, temp2, 20, a2, a3,  \
              20, a0, a1, 20, in0, in1, in2, in3);

void ff_simple_idct_lasx(int16_t *block)
{
    int32_t const_val = 1 << 10;
    /* Each 128-bit lane of w1 packs the 16-bit IDCT weights
     * {0, W1..W7} = {0, 22725, 21407, 19266, 16383, 12873, 8867, 4520};
     * LASX_IDCTROWCONDDC splats them into w2..w7 (and w1 itself) with
     * __lasx_xvrepl128vei_h. */
    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC
    LASX_IDCTCOLS
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    __lasx_xvst(in0, block, 0);
    __lasx_xvst(in1, block, 32);
    __lasx_xvst(in2, block, 64);
    __lasx_xvst(in3, block, 96);
}
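
/*
 * ff_simple_idct_lasx above writes the result back into block; the _put
 * variant below clips to 8 bits and overwrites dst, while the _add variant
 * adds the result to the existing pixels before clipping.  A minimal wiring
 * sketch, assuming the usual IDCTDSPContext hookup performed by the
 * platform init code (shown for illustration only):
 *
 *     c->idct      = ff_simple_idct_lasx;
 *     c->idct_put  = ff_simple_idct_put_lasx;
 *     c->idct_add  = ff_simple_idct_add_lasx;
 *     c->perm_type = FF_IDCT_PERM_NONE;
 */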

void ff_simple_idct_put_lasx(uint8_t *dst, ptrdiff_t dst_stride,
                             int16_t *block)
{
    int32_t const_val = 1 << 10;
    ptrdiff_t dst_stride_2x = dst_stride << 1;
    ptrdiff_t dst_stride_4x = dst_stride << 2;
    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC
    LASX_IDCTCOLS
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
    __lasx_xvstelm_d(in0, dst, 0, 0);
    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(in1, dst, 0, 0);
    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
}

void ff_simple_idct_add_lasx(uint8_t *dst, ptrdiff_t dst_stride,
                             int16_t *block)
{
    int32_t const_val = 1 << 10;
    uint8_t *dst1 = dst;
    ptrdiff_t dst_stride_2x = dst_stride << 1;
    ptrdiff_t dst_stride_4x = dst_stride << 2;
    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i sh = {0x0003000200010000, 0x000B000A00090008,
                  0x0007000600050004, 0x000F000E000D000C};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC
    LASX_IDCTCOLS
    a0 = __lasx_xvldrepl_d(dst1, 0);
    a0 = __lasx_vext2xv_hu_bu(a0);
    dst1 += dst_stride;
    a1 = __lasx_xvldrepl_d(dst1, 0);
    a1 = __lasx_vext2xv_hu_bu(a1);
    dst1 += dst_stride;
    a2 = __lasx_xvldrepl_d(dst1, 0);
    a2 = __lasx_vext2xv_hu_bu(a2);
    dst1 += dst_stride;
    a3 = __lasx_xvldrepl_d(dst1, 0);
    a3 = __lasx_vext2xv_hu_bu(a3);
    dst1 += dst_stride;
    b0 = __lasx_xvldrepl_d(dst1, 0);
    b0 = __lasx_vext2xv_hu_bu(b0);
    dst1 += dst_stride;
    b1 = __lasx_xvldrepl_d(dst1, 0);
    b1 = __lasx_vext2xv_hu_bu(b1);
    dst1 += dst_stride;
    b2 = __lasx_xvldrepl_d(dst1, 0);
    b2 = __lasx_vext2xv_hu_bu(b2);
    dst1 += dst_stride;
    b3 = __lasx_xvldrepl_d(dst1, 0);
    b3 = __lasx_vext2xv_hu_bu(b3);
    DUP4_ARG3(__lasx_xvshuf_h, sh, a1, a0, sh, a3, a2, sh, b1, b0, sh, b3, b2,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvadd_h, temp0, in0, temp1, in1, temp2, in2, temp3, in3,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
    __lasx_xvstelm_d(in0, dst, 0, 0);
    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(in1, dst, 0, 0);
    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
}