/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "hevcdsp_lsx.h"

static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
    64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
};

static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
    64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
    64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
    64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
    64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
};

static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
    90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
    90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
    88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
    85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
    82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
    78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
    73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
    67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
    61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
    54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
    46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
    38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
    31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
    22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
    13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
    4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
};

static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
    90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
    80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
    57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
    25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
};

static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
    89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
};

#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, \
                         sum0, sum1, sum2, sum3, shift) \
{ \
    __m128i vec0, vec1, vec2, vec3, vec4, vec5; \
    __m128i cnst64 = __lsx_vldi(0x0840); \
    __m128i cnst83 = __lsx_vldi(0x0853); \
    __m128i cnst36 = __lsx_vldi(0x0824); \
    \
    vec0 = __lsx_vdp2_w_h(in_r0, cnst64); \
    vec1 = __lsx_vdp2_w_h(in_l0, cnst83); \
    vec2 = __lsx_vdp2_w_h(in_r1, cnst64); \
    vec3 = __lsx_vdp2_w_h(in_l1, cnst36); \
    vec4 = __lsx_vdp2_w_h(in_l0, cnst36); \
    vec5 = __lsx_vdp2_w_h(in_l1, cnst83); \
    \
    sum0 = __lsx_vadd_w(vec0, vec2); \
    sum1 = __lsx_vsub_w(vec0, vec2); \
    vec1 = __lsx_vadd_w(vec1, vec3); \
    vec4 = __lsx_vsub_w(vec4, vec5); \
    sum2 = __lsx_vsub_w(sum1, vec4); \
    sum3 = __lsx_vsub_w(sum0, vec1); \
    sum0 = __lsx_vadd_w(sum0, vec1); \
    sum1 = __lsx_vadd_w(sum1, vec4); \
    \
    sum0 = __lsx_vsrari_w(sum0, shift); \
    sum1 = __lsx_vsrari_w(sum1, shift); \
    sum2 = __lsx_vsrari_w(sum2, shift); \
    sum3 = __lsx_vsrari_w(sum3, shift); \
    sum0 = __lsx_vsat_w(sum0, 15); \
    sum1 = __lsx_vsat_w(sum1, 15); \
    sum2 = __lsx_vsat_w(sum2, 15); \
    sum3 = __lsx_vsat_w(sum3, 15); \
}

#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift) \
{ \
    __m128i src0_r, src1_r, src2_r, src3_r; \
    __m128i src0_l, src1_l, src2_l, src3_l; \
    __m128i filter0, filter1, filter2, filter3; \
    __m128i temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r; \
    __m128i temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l; \
    __m128i sum0_r, sum1_r, sum2_r, sum3_r; \
    __m128i sum0_l, sum1_l, sum2_l, sum3_l; \
    \
    DUP4_ARG2(__lsx_vilvl_h, in4, in0, in6, in2, in5, in1, in3, in7, \
              src0_r, src1_r, src2_r, src3_r); \
    DUP4_ARG2(__lsx_vilvh_h, in4, in0, in6, in2, in5, in1, in3, in7, \
              src0_l, src1_l, src2_l, src3_l); \
    \
    DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 8, \
              filter, 12, filter0, filter1, filter2, filter3); \
    DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
              src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \
              temp1_r, temp1_l); \
    \
    LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l, \
                      sum1_l, sum1_r); \
    sum2_r = sum1_r; \
    sum2_l = sum1_l; \
    sum3_r = sum0_r; \
    sum3_l = sum0_l; \
    \
    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \
              src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \
              temp3_r, temp3_l); \
    temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \
    temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \
    sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \
    sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \
    sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \
    sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \
    \
    in0 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \
    in7 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \
    \
    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \
              src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \
              temp5_r, temp5_l); \
    temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \
    temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \
    sum1_r = __lsx_vadd_w(sum1_r, temp4_r); \
    sum1_l = __lsx_vadd_w(sum1_l, temp4_l); \
    sum2_r = __lsx_vsub_w(sum2_r, temp4_r); \
    sum2_l = __lsx_vsub_w(sum2_l, temp4_l); \
    \
    in3 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \
    in4 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \
    \
    DUP4_ARG2(__lsx_vldrepl_w, filter, 16, filter, 20, filter, 24, \
              filter, 28, filter0, filter1, filter2, filter3); \
    DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
              src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \
              temp1_r, temp1_l); \
    \
    LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l, \
                      sum1_l, sum1_r); \
    sum2_r = sum1_r; \
    sum2_l = sum1_l; \
    sum3_r = sum0_r; \
    sum3_l = sum0_l; \
    \
    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \
              src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \
              temp3_r, temp3_l); \
    temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \
    temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \
    sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \
    sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \
    sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \
    sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \
    \
    in1 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \
    in6 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \
    \
    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \
              src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \
              temp5_r, temp5_l); \
    temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \
    temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \
    sum1_r = __lsx_vsub_w(sum1_r, temp4_r); \
    sum1_l = __lsx_vsub_w(sum1_l, temp4_l); \
    sum2_r = __lsx_vadd_w(sum2_r, temp4_r); \
    sum2_l = __lsx_vadd_w(sum2_l, temp4_l); \
    \
    in2 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \
    in5 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \
}

#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, \
                           src4_r, src5_r, src6_r, src7_r, \
                           src0_l, src1_l, src2_l, src3_l, \
                           src4_l, src5_l, src6_l, src7_l, shift) \
{ \
    int16_t *ptr0, *ptr1; \
    __m128i dst0, dst1; \
    __m128i filter0, filter1, filter2, filter3; \
    __m128i temp0_r, temp1_r, temp0_l, temp1_l; \
    __m128i sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l; \
    __m128i sum3_l, res0_r, res1_r, res0_l, res1_l; \
    \
    ptr0 = (buf_ptr + 112); \
    ptr1 = (buf_ptr + 128); \
    k = -1; \
    \
    for (j = 0; j < 4; j++) \
    { \
        DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 16, \
                  filter, 20, filter0, filter1, filter2, filter3); \
        DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
                  src4_r, filter2, src4_l, filter2, sum0_r, sum0_l, \
                  sum2_r, sum2_l); \
        DUP2_ARG2(__lsx_vdp2_w_h, src7_r, filter2, src7_l, filter2, \
                  sum3_r, sum3_l); \
        DUP4_ARG3(__lsx_vdp2add_w_h, sum0_r, src1_r, filter1, sum0_l, \
                  src1_l, filter1, sum2_r, src5_r, filter3, sum2_l, \
                  src5_l, filter3, sum0_r, sum0_l, sum2_r, sum2_l); \
        DUP2_ARG3(__lsx_vdp2add_w_h, sum3_r, src6_r, filter3, sum3_l, \
                  src6_l, filter3, sum3_r, sum3_l); \
        \
        sum1_r = sum0_r; \
        sum1_l = sum0_l; \
        \
        DUP4_ARG2(__lsx_vldrepl_w, filter, 8, filter, 12, filter, 24, \
                  filter, 28, filter0, filter1, filter2, filter3); \
        filter += 16; \
        DUP2_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0, \
                  temp0_r, temp0_l); \
        DUP2_ARG3(__lsx_vdp2add_w_h, sum2_r, src6_r, filter2, sum2_l, \
                  src6_l, filter2, sum2_r, sum2_l); \
        DUP2_ARG2(__lsx_vdp2_w_h, src5_r, filter2, src5_l, filter2, \
                  temp1_r, temp1_l); \
        \
        sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \
        sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \
        sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \
        sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \
        sum3_r = __lsx_vsub_w(temp1_r, sum3_r); \
        sum3_l = __lsx_vsub_w(temp1_l, sum3_l); \
        \
        DUP2_ARG2(__lsx_vdp2_w_h, src3_r, filter1, src3_l, filter1, \
                  temp0_r, temp0_l); \
        DUP4_ARG3(__lsx_vdp2add_w_h, sum2_r, src7_r, filter3, sum2_l, \
                  src7_l, filter3, sum3_r, src4_r, filter3, sum3_l, \
                  src4_l, filter3, sum2_r, sum2_l, sum3_r, sum3_l); \
        \
        sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \
        sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \
        sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \
        sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \
        \
        LSX_BUTTERFLY_4_W(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l, \
                          res1_l, res1_r); \
        dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \
        dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \
        __lsx_vst(dst0, buf_ptr, 0); \
        __lsx_vst(dst1, (buf_ptr + ((15 - (j * 2)) << 4)), 0); \
        \
        LSX_BUTTERFLY_4_W(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l, \
                          res1_l, res1_r); \
        \
        dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \
        dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \
        __lsx_vst(dst0, (ptr0 + ((((j + 1) >> 1) * 2 * k) << 4)), 0); \
        __lsx_vst(dst1, (ptr1 - ((((j + 1) >> 1) * 2 * k) << 4)), 0); \
        \
        k *= -1; \
        buf_ptr += 16; \
    } \
}

#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx) \
{ \
    tmp0_r = __lsx_vld(input + load_idx * 8, 0); \
    tmp0_l = __lsx_vld(input + load_idx * 8, 16); \
    tmp1_r = sum0_r; \
    tmp1_l = sum0_l; \
    sum0_r = __lsx_vadd_w(sum0_r, tmp0_r); \
    sum0_l = __lsx_vadd_w(sum0_l, tmp0_l); \
    __lsx_vst(sum0_r, (input + load_idx * 8), 0); \
    __lsx_vst(sum0_l, (input + load_idx * 8), 16); \
    tmp1_r = __lsx_vsub_w(tmp1_r, tmp0_r); \
    tmp1_l = __lsx_vsub_w(tmp1_l, tmp0_l); \
    __lsx_vst(tmp1_r, (input + store_idx * 8), 0); \
    __lsx_vst(tmp1_l, (input + store_idx * 8), 16); \
}

#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, \
                              res0, res1, res2, res3, shift) \
{ \
    __m128i vec0, vec1, vec2, vec3; \
    __m128i cnst74 = __lsx_vldi(0x84a); \
    __m128i cnst55 = __lsx_vldi(0x837); \
    __m128i cnst29 = __lsx_vldi(0x81d); \
    \
    vec0 = __lsx_vadd_w(in_r0, in_r1); \
    vec2 = __lsx_vsub_w(in_r0, in_l1); \
    res0 = __lsx_vmul_w(vec0, cnst29); \
    res1 = __lsx_vmul_w(vec2, cnst55); \
    res2 = __lsx_vsub_w(in_r0, in_r1); \
    vec1 = __lsx_vadd_w(in_r1, in_l1); \
    res2 = __lsx_vadd_w(res2, in_l1); \
    vec3 = __lsx_vmul_w(in_l0, cnst74); \
    res3 = __lsx_vmul_w(vec0, cnst55); \
    \
    res0 = __lsx_vadd_w(res0, __lsx_vmul_w(vec1, cnst55)); \
    res1 = __lsx_vsub_w(res1, __lsx_vmul_w(vec1, cnst29)); \
    res2 = __lsx_vmul_w(res2, cnst74); \
    res3 = __lsx_vadd_w(res3, __lsx_vmul_w(vec2, cnst29)); \
    \
    res0 = __lsx_vadd_w(res0, vec3); \
    res1 = __lsx_vadd_w(res1, vec3); \
    res3 = __lsx_vsub_w(res3, vec3); \
    \
    res0 = __lsx_vsrari_w(res0, shift); \
    res1 = __lsx_vsrari_w(res1, shift); \
    res2 = __lsx_vsrari_w(res2, shift); \
    res3 = __lsx_vsrari_w(res3, shift); \
    res0 = __lsx_vsat_w(res0, 15); \
    res1 = __lsx_vsat_w(res1, 15); \
    res2 = __lsx_vsat_w(res2, 15); \
    res3 = __lsx_vsat_w(res3, 15); \
}

void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit)
{
    __m128i in0, in1;
    __m128i in_r0, in_l0, in_r1, in_l1;
    __m128i sum0, sum1, sum2, sum3;
    __m128i zero = __lsx_vldi(0x00);

    in0 = __lsx_vld(coeffs, 0);
    in1 = __lsx_vld(coeffs, 16);
    in_r0 = __lsx_vilvl_h(zero, in0);
    in_l0 = __lsx_vilvh_h(zero, in0);
    in_r1 = __lsx_vilvl_h(zero, in1);
    in_l1 = __lsx_vilvh_h(zero, in1);

    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
    LSX_TRANSPOSE4x4_W(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);

    /* Pack and transpose */
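    /* Note: the two passes above already clip each 32-bit sum to the int16
     * range with __lsx_vsat_w, so picking the even halfwords below packs the
     * results down to 16 bits; the interleaves then transpose the 4x4 block
     * back to row order before it is stored. */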
    in0 = __lsx_vpickev_h(sum2, sum0);
    in1 = __lsx_vpickev_h(sum3, sum1);
    sum0 = __lsx_vilvl_h(in1, in0);
    sum1 = __lsx_vilvh_h(in1, in0);
    in0 = __lsx_vilvl_w(sum1, sum0);
    in1 = __lsx_vilvh_w(sum1, sum0);

    __lsx_vst(in0, coeffs, 0);
    __lsx_vst(in1, coeffs, 16);
}

void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit)
{
    const int16_t *filter = &gt8x8_cnst[0];
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;

    DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 16, coeffs, 32,
              coeffs, 48, in0, in1, in2, in3);
    DUP4_ARG2(__lsx_vld, coeffs, 64, coeffs, 80, coeffs, 96,
              coeffs, 112, in4, in5, in6, in7);
    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);

    __lsx_vst(in0, coeffs, 0);
    __lsx_vst(in1, coeffs, 16);
    __lsx_vst(in2, coeffs, 32);
    __lsx_vst(in3, coeffs, 48);
    __lsx_vst(in4, coeffs, 64);
    __lsx_vst(in5, coeffs, 80);
    __lsx_vst(in6, coeffs, 96);
    __lsx_vst(in7, coeffs, 112);
}

void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit)
{
    int16_t i, j, k;
    int16_t buf[256];
    int16_t *buf_ptr = &buf[0];
    int16_t *src = coeffs;
    const int16_t *filter = &gt16x16_cnst[0];
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i in8, in9, in10, in11, in12, in13, in14, in15;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;

    for (i = 2; i--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
                  in0, in1, in2, in3);
        DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
                  in4, in5, in6, in7);
        DUP4_ARG2(__lsx_vld, src, 256, src, 288, src, 320, src, 352,
                  in8, in9, in10, in11);
        DUP4_ARG2(__lsx_vld, src, 384, src, 416, src, 448, src, 480,
                  in12, in13, in14, in15);

        DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
                  src0_r, src1_r, src2_r, src3_r);
        DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
                  src4_r, src5_r, src6_r, src7_r);
        DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
                  src0_l, src1_l, src2_l, src3_l);
        DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
                  src4_l, src5_l, src6_l, src7_l);

        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
                           src4_l, src5_l, src6_l, src7_l, 7);

        src += 8;
        buf_ptr = (&buf[0] + 8);
        filter = &gt16x16_cnst[0];
    }

    src = &buf[0];
    buf_ptr = coeffs;
    filter = &gt16x16_cnst[0];

    for (i = 2; i--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  in0, in8, in1, in9);
        DUP4_ARG2(__lsx_vld, src, 64, src, 80, src, 96, src, 112,
                  in2, in10, in3, in11);
        DUP4_ARG2(__lsx_vld, src, 128, src, 144, src, 160, src, 176,
                  in4, in12, in5, in13);
        DUP4_ARG2(__lsx_vld, src, 192, src, 208, src, 224, src, 240,
                  in6, in14, in7, in15);
        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
                           in8, in9, in10, in11, in12, in13, in14, in15);
        DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
                  src0_r, src1_r, src2_r, src3_r);
        DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
                  src4_r, src5_r, src6_r, src7_r);
        DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
                  src0_l, src1_l, src2_l, src3_l);
        DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
                  src4_l, src5_l, src6_l, src7_l);
        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
                           src4_l, src5_l, src6_l, src7_l, 12);

        src += 128;
        buf_ptr = coeffs + 8;
        filter = &gt16x16_cnst[0];
    }

    DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 32, coeffs, 64, coeffs, 96,
              in0, in1, in2, in3);
    DUP4_ARG2(__lsx_vld, coeffs, 128, coeffs, 160, coeffs, 192, coeffs, 224,
              in4, in5, in6, in7);
    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    __lsx_vst(vec0, coeffs, 0);
    __lsx_vst(vec1, coeffs, 32);
    __lsx_vst(vec2, coeffs, 64);
    __lsx_vst(vec3, coeffs, 96);
    __lsx_vst(vec4, coeffs, 128);
    __lsx_vst(vec5, coeffs, 160);
    __lsx_vst(vec6, coeffs, 192);
    __lsx_vst(vec7, coeffs, 224);

    src = coeffs + 8;
    DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, in0, in1, in2, in3);
    DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
              in4, in5, in6, in7);
    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    src = coeffs + 128;
    DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
              in8, in9, in10, in11);
    DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
              in12, in13, in14, in15);

    __lsx_vst(vec0, src, 0);
    __lsx_vst(vec1, src, 32);
    __lsx_vst(vec2, src, 64);
    __lsx_vst(vec3, src, 96);
    __lsx_vst(vec4, src, 128);
    __lsx_vst(vec5, src, 160);
    __lsx_vst(vec6, src, 192);
    __lsx_vst(vec7, src, 224);
    LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    src = coeffs + 8;
    __lsx_vst(vec0, src, 0);
    __lsx_vst(vec1, src, 32);
    __lsx_vst(vec2, src, 64);
    __lsx_vst(vec3, src, 96);
    __lsx_vst(vec4, src, 128);
    __lsx_vst(vec5, src, 160);
    __lsx_vst(vec6, src, 192);
    __lsx_vst(vec7, src, 224);

    src = coeffs + 136;
    DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
              in0, in1, in2, in3);
    DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
              in4, in5, in6, in7);
    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    __lsx_vst(vec0, src, 0);
    __lsx_vst(vec1, src, 32);
    __lsx_vst(vec2, src, 64);
    __lsx_vst(vec3, src, 96);
    __lsx_vst(vec4, src, 128);
    __lsx_vst(vec5, src, 160);
    __lsx_vst(vec6, src, 192);
    __lsx_vst(vec7, src, 224);
}

static void hevc_idct_8x32_column_lsx(int16_t *coeffs, int32_t buf_pitch,
                                      uint8_t round)
{
    uint8_t i;
    int32_t buf_pitch_2 = buf_pitch << 1;
    int32_t buf_pitch_4 = buf_pitch << 2;
    int32_t buf_pitch_8 = buf_pitch << 3;
    int32_t buf_pitch_16 = buf_pitch << 4;

    const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
    const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
    const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
    const int16_t *filter_ptr3 = &gt8x8_cnst[0];
    int16_t *src0 = (coeffs + buf_pitch);
    int16_t *src1 = (coeffs + buf_pitch_2);
    int16_t *src2 = (coeffs + buf_pitch_4);
    int16_t *src3 = (coeffs);
    int32_t tmp_buf[8 * 32 + 15];
    int32_t *tmp_buf_ptr = tmp_buf + 15;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
    __m128i filter0, filter1, filter2, filter3;
    __m128i sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;

    /* Align pointer to 64 byte boundary */
    tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);

    /* process coeff 4, 12, 20, 28 */
    in0 = __lsx_vld(src2, 0);
    in1 = __lsx_vld(src2 + buf_pitch_8, 0);
    in2 = __lsx_vld(src2 + buf_pitch_16, 0);
    in3 = __lsx_vld(src2 + buf_pitch_16 + buf_pitch_8, 0);
    in4 = __lsx_vld(src3, 0);
    in5 = __lsx_vld(src3 + buf_pitch_8, 0);
    in6 = __lsx_vld(src3 + buf_pitch_16, 0);
    in7 = __lsx_vld(src3 + buf_pitch_16 + buf_pitch_8, 0);
    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in6, in4, in7, in5,
              src0_r, src1_r, src2_r, src3_r);
    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in6, in4, in7, in5,
              src0_l, src1_l, src2_l, src3_l);

    filter0 = __lsx_vldrepl_w(filter_ptr2, 0);
    filter1 = __lsx_vldrepl_w(filter_ptr2, 4);
    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
    __lsx_vst(sum0_r, tmp_buf_ptr, 0);
    __lsx_vst(sum0_l, tmp_buf_ptr, 16);

    filter0 = __lsx_vldrepl_w(filter_ptr2, 8);
    filter1 = __lsx_vldrepl_w(filter_ptr2, 12);
    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
    __lsx_vst(sum0_r, tmp_buf_ptr, 32);
    __lsx_vst(sum0_l, tmp_buf_ptr, 48);

    filter0 = __lsx_vldrepl_w(filter_ptr2, 16);
    filter1 = __lsx_vldrepl_w(filter_ptr2, 20);
    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
    __lsx_vst(sum0_r, tmp_buf_ptr, 64);
    __lsx_vst(sum0_l, tmp_buf_ptr, 80);

    filter0 = __lsx_vldrepl_w(filter_ptr2, 24);
    filter1 = __lsx_vldrepl_w(filter_ptr2, 28);
    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
    __lsx_vst(sum0_r, tmp_buf_ptr, 96);
    __lsx_vst(sum0_l, tmp_buf_ptr, 112);

    /* process coeff 0, 8, 16, 24 */
    filter0 = __lsx_vldrepl_w(filter_ptr3, 0);
    filter1 = __lsx_vldrepl_w(filter_ptr3, 4);

    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
              src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
    sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
    sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
    sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
    sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);

    HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 0, 7);
    HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 3, 4);

    filter0 = __lsx_vldrepl_w(filter_ptr3, 16);
    filter1 = __lsx_vldrepl_w(filter_ptr3, 20);

    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
              src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
    sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
    sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
    sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
    sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);

    HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 1, 6);
    HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 2, 5);

    /* process coeff 2 6 10 14 18 22 26 30 */
    in0 = __lsx_vld(src1, 0);
    in1 = __lsx_vld(src1 + buf_pitch_4, 0);
    in2 = __lsx_vld(src1 + buf_pitch_8, 0);
    in3 = __lsx_vld(src1 + buf_pitch_8 + buf_pitch_4, 0);
    in4 = __lsx_vld(src1 + buf_pitch_16, 0);
    in5 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_4, 0);
    in6 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8, 0);
    in7 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8 + buf_pitch_4, 0);

    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src0_r, src1_r, src2_r, src3_r);
    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src0_l, src1_l, src2_l, src3_l);

    /* loop for all columns of constants */
    for (i = 0; i < 8; i++) {
        /* processing single column of constants */
        filter0 = __lsx_vldrepl_w(filter_ptr1, 0);
        filter1 = __lsx_vldrepl_w(filter_ptr1, 4);
        filter2 = __lsx_vldrepl_w(filter_ptr1, 8);
        filter3 = __lsx_vldrepl_w(filter_ptr1, 12);
        sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
        sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);

        tmp0_r = __lsx_vld(tmp_buf_ptr + (i << 3), 0);
        tmp0_l = __lsx_vld(tmp_buf_ptr + (i << 3), 16);
        tmp1_r = tmp0_r;
        tmp1_l = tmp0_l;
        tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
        tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
        tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
        tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
        __lsx_vst(tmp0_r, tmp_buf_ptr + (i << 3), 0);
        __lsx_vst(tmp0_l, tmp_buf_ptr + (i << 3), 16);
        __lsx_vst(tmp1_r, tmp_buf_ptr + ((15 - i) * 8), 0);
        __lsx_vst(tmp1_l, tmp_buf_ptr + ((15 - i) * 8), 16);

        filter_ptr1 += 8;
    }

    /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
    in0 = __lsx_vld(src0, 0);
    in1 = __lsx_vld(src0 + buf_pitch_2, 0);
    in2 = __lsx_vld(src0 + buf_pitch_4, 0);
    in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
    in4 = __lsx_vld(src0 + buf_pitch_8, 0);
    in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
    in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
    in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);

    src0 += 16 * buf_pitch;
    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src0_r, src1_r, src2_r, src3_r);
    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src0_l, src1_l, src2_l, src3_l);
    in0 = __lsx_vld(src0, 0);
    in1 = __lsx_vld(src0 + buf_pitch_2, 0);
    in2 = __lsx_vld(src0 + buf_pitch_4, 0);
    in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
    in4 = __lsx_vld(src0 + buf_pitch_8, 0);
    in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
    in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
    in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);

    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src4_r, src5_r, src6_r, src7_r);
    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src4_l, src5_l, src6_l, src7_l);

    /* loop for all columns of filter constants */
    for (i = 0; i < 16; i++) {
        /* processing single column of constants */
        filter0 = __lsx_vldrepl_w(filter_ptr0, 0);
        filter1 = __lsx_vldrepl_w(filter_ptr0, 4);
        filter2 = __lsx_vldrepl_w(filter_ptr0, 8);
        filter3 = __lsx_vldrepl_w(filter_ptr0, 12);
        sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
        sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
        tmp1_r = sum0_r;
        tmp1_l = sum0_l;

        filter0 = __lsx_vldrepl_w(filter_ptr0, 16);
        filter1 = __lsx_vldrepl_w(filter_ptr0, 20);
        filter2 = __lsx_vldrepl_w(filter_ptr0, 24);
        filter3 = __lsx_vldrepl_w(filter_ptr0, 28);
        sum0_r = __lsx_vdp2_w_h(src4_r, filter0);
        sum0_l = __lsx_vdp2_w_h(src4_l, filter0);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src5_r, filter1);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src5_l, filter1);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src6_r, filter2);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src6_l, filter2);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src7_r, filter3);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src7_l, filter3);
        sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
        sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);

        tmp0_r = __lsx_vld(tmp_buf_ptr + i * 8, 0);
        tmp0_l = __lsx_vld(tmp_buf_ptr + i * 8, 16);
        tmp1_r = tmp0_r;
        tmp1_l = tmp0_l;
        tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
        tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
        sum1_r = __lsx_vreplgr2vr_w(round);
        tmp0_r = __lsx_vssrarn_h_w(tmp0_r, sum1_r);
        tmp0_l = __lsx_vssrarn_h_w(tmp0_l, sum1_r);
        in0 = __lsx_vpackev_d(tmp0_l, tmp0_r);
        __lsx_vst(in0, (coeffs + i * buf_pitch), 0);
        tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
        tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
        tmp1_r = __lsx_vssrarn_h_w(tmp1_r, sum1_r);
        tmp1_l = __lsx_vssrarn_h_w(tmp1_l, sum1_r);
        in0 = __lsx_vpackev_d(tmp1_l, tmp1_r);
        __lsx_vst(in0, (coeffs + (31 - i) * buf_pitch), 0);

        filter_ptr0 += 16;
    }
}

static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
{
    uint8_t i;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;

    for (i = 0; i < 4; i++) {
        DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 64, coeffs, 128,
                  coeffs, 192, in0, in1, in2, in3);
        DUP4_ARG2(__lsx_vld, coeffs, 256, coeffs, 320, coeffs, 384,
                  coeffs, 448, in4, in5, in6, in7);
        coeffs += 8;
        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        __lsx_vst(in0, tmp_buf, 0);
        __lsx_vst(in1, tmp_buf, 16);
        __lsx_vst(in2, tmp_buf, 32);
        __lsx_vst(in3, tmp_buf, 48);
        __lsx_vst(in4, tmp_buf, 64);
        __lsx_vst(in5, tmp_buf, 80);
        __lsx_vst(in6, tmp_buf, 96);
        __lsx_vst(in7, tmp_buf, 112);
        tmp_buf += 64;
    }
}

static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
{
    uint8_t i;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;

    for (i = 0; i < 4; i++) {
        DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 16, tmp_buf, 32,
                  tmp_buf, 48, in0, in1, in2, in3);
        DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 80, tmp_buf, 96,
                  tmp_buf, 112, in4, in5, in6, in7);
        tmp_buf += 64;
        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        __lsx_vst(in0, coeffs, 0);
        __lsx_vst(in1, coeffs, 64);
        __lsx_vst(in2, coeffs, 128);
        __lsx_vst(in3, coeffs, 192);
        __lsx_vst(in4, coeffs, 256);
        __lsx_vst(in5, coeffs, 320);
        __lsx_vst(in6, coeffs, 384);
        __lsx_vst(in7, coeffs, 448);
        coeffs += 8;
    }
}

void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit)
{
    uint8_t row_cnt, col_cnt;
    int16_t *src = coeffs;
    int16_t tmp_buf[8 * 32 + 31];
    int16_t *tmp_buf_ptr = tmp_buf + 31;
    uint8_t round;
    int32_t buf_pitch;

    /* Align pointer to 64 byte boundary */
    tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);

    /* column transform */
    round = 7;
    buf_pitch = 32;
    for (col_cnt = 0; col_cnt < 4; col_cnt++) {
        /* process 8x32 blocks */
        hevc_idct_8x32_column_lsx((coeffs + col_cnt * 8), buf_pitch, round);
    }

    /* row transform */
    round = 12;
    buf_pitch = 8;
    for (row_cnt = 0; row_cnt < 4; row_cnt++) {
        /* process 32x8 blocks */
        src = (coeffs + 32 * 8 * row_cnt);

        hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
        hevc_idct_8x32_column_lsx(tmp_buf_ptr, buf_pitch, round);
        hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
    }
}