1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Loongson LASX optimized h264dsp 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited 5cabdff1aSopenharmony_ci * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 6cabdff1aSopenharmony_ci * Xiwei Gu <guxiwei-hf@loongson.cn> 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * This file is part of FFmpeg. 9cabdff1aSopenharmony_ci * 10cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 11cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 12cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 13cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 14cabdff1aSopenharmony_ci * 15cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 16cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 17cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18cabdff1aSopenharmony_ci * Lesser General Public License for more details. 19cabdff1aSopenharmony_ci * 20cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 21cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 22cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23cabdff1aSopenharmony_ci */ 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 26cabdff1aSopenharmony_ci#include "h264dsp_lasx.h" 27cabdff1aSopenharmony_ci#include "libavcodec/bit_depth_template.c" 28cabdff1aSopenharmony_ci 29cabdff1aSopenharmony_ci#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3) \ 30cabdff1aSopenharmony_ci{ \ 31cabdff1aSopenharmony_ci __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 32cabdff1aSopenharmony_ci \ 33cabdff1aSopenharmony_ci tmp0_m = __lasx_xvadd_h(in0, in2); \ 34cabdff1aSopenharmony_ci tmp1_m = __lasx_xvsub_h(in0, in2); \ 35cabdff1aSopenharmony_ci tmp2_m = __lasx_xvsrai_h(in1, 1); \ 36cabdff1aSopenharmony_ci tmp2_m = __lasx_xvsub_h(tmp2_m, in3); \ 37cabdff1aSopenharmony_ci tmp3_m = __lasx_xvsrai_h(in3, 1); \ 38cabdff1aSopenharmony_ci tmp3_m = __lasx_xvadd_h(in1, tmp3_m); \ 39cabdff1aSopenharmony_ci \ 40cabdff1aSopenharmony_ci LASX_BUTTERFLY_4_H(tmp0_m, tmp1_m, tmp2_m, tmp3_m, \ 41cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 42cabdff1aSopenharmony_ci} 43cabdff1aSopenharmony_ci 44cabdff1aSopenharmony_civoid ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride) 45cabdff1aSopenharmony_ci{ 46cabdff1aSopenharmony_ci __m256i src0_m, src1_m, src2_m, src3_m; 47cabdff1aSopenharmony_ci __m256i dst0_m, dst1_m; 48cabdff1aSopenharmony_ci __m256i hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3; 49cabdff1aSopenharmony_ci __m256i inp0_m, inp1_m, res0_m, src1, src3; 50cabdff1aSopenharmony_ci __m256i src0 = __lasx_xvld(src, 0); 51cabdff1aSopenharmony_ci __m256i src2 = __lasx_xvld(src, 16); 52cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 53cabdff1aSopenharmony_ci int32_t dst_stride_2x = dst_stride << 1; 54cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 55cabdff1aSopenharmony_ci 56cabdff1aSopenharmony_ci __lasx_xvst(zero, src, 0); 57cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_d, src0, src0, src2, src2, src1, src3); 58cabdff1aSopenharmony_ci AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3); 59cabdff1aSopenharmony_ci LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3); 60cabdff1aSopenharmony_ci AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3); 61cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x, 62cabdff1aSopenharmony_ci dst, dst_stride_3x, src0_m, src1_m, src2_m, src3_m); 63cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvld, dst, 0, dst + dst_stride, 0, dst + dst_stride_2x, 64cabdff1aSopenharmony_ci 0, dst + dst_stride_3x, 0, src0_m, src1_m, src2_m, src3_m); 65cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, vres1, vres0, vres3, vres2, inp0_m, inp1_m); 66cabdff1aSopenharmony_ci inp0_m = __lasx_xvpermi_q(inp1_m, inp0_m, 0x20); 67cabdff1aSopenharmony_ci inp0_m = __lasx_xvsrari_h(inp0_m, 6); 68cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_w, src1_m, src0_m, src3_m, src2_m, dst0_m, dst1_m); 69cabdff1aSopenharmony_ci dst0_m = __lasx_xvilvl_d(dst1_m, dst0_m); 70cabdff1aSopenharmony_ci res0_m = __lasx_vext2xv_hu_bu(dst0_m); 71cabdff1aSopenharmony_ci res0_m = __lasx_xvadd_h(res0_m, inp0_m); 72cabdff1aSopenharmony_ci res0_m = __lasx_xvclip255_h(res0_m); 73cabdff1aSopenharmony_ci dst0_m = __lasx_xvpickev_b(res0_m, res0_m); 74cabdff1aSopenharmony_ci __lasx_xvstelm_w(dst0_m, dst, 0, 0); 75cabdff1aSopenharmony_ci __lasx_xvstelm_w(dst0_m, dst + dst_stride, 0, 1); 76cabdff1aSopenharmony_ci __lasx_xvstelm_w(dst0_m, dst + dst_stride_2x, 0, 4); 77cabdff1aSopenharmony_ci __lasx_xvstelm_w(dst0_m, dst + dst_stride_3x, 0, 5); 78cabdff1aSopenharmony_ci} 79cabdff1aSopenharmony_ci 80cabdff1aSopenharmony_civoid ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src, 81cabdff1aSopenharmony_ci int32_t dst_stride) 82cabdff1aSopenharmony_ci{ 83cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7; 84cabdff1aSopenharmony_ci __m256i vec0, vec1, vec2, vec3; 85cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 86cabdff1aSopenharmony_ci __m256i res0, res1, res2, res3, res4, res5, res6, res7; 87cabdff1aSopenharmony_ci __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 88cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 89cabdff1aSopenharmony_ci int32_t dst_stride_2x = dst_stride << 1; 90cabdff1aSopenharmony_ci int32_t dst_stride_4x = dst_stride << 2; 91cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 92cabdff1aSopenharmony_ci 93cabdff1aSopenharmony_ci src[0] += 32; 94cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvld, src, 0, src, 16, src, 32, src, 48, 95cabdff1aSopenharmony_ci src0, src1, src2, src3); 96cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvld, src, 64, src, 80, src, 96, src, 112, 97cabdff1aSopenharmony_ci src4, src5, src6, src7); 98cabdff1aSopenharmony_ci __lasx_xvst(zero, src, 0); 99cabdff1aSopenharmony_ci __lasx_xvst(zero, src, 32); 100cabdff1aSopenharmony_ci __lasx_xvst(zero, src, 64); 101cabdff1aSopenharmony_ci __lasx_xvst(zero, src, 96); 102cabdff1aSopenharmony_ci 103cabdff1aSopenharmony_ci vec0 = __lasx_xvadd_h(src0, src4); 104cabdff1aSopenharmony_ci vec1 = __lasx_xvsub_h(src0, src4); 105cabdff1aSopenharmony_ci vec2 = __lasx_xvsrai_h(src2, 1); 106cabdff1aSopenharmony_ci vec2 = __lasx_xvsub_h(vec2, src6); 107cabdff1aSopenharmony_ci vec3 = __lasx_xvsrai_h(src6, 1); 108cabdff1aSopenharmony_ci vec3 = __lasx_xvadd_h(src2, vec3); 109cabdff1aSopenharmony_ci 110cabdff1aSopenharmony_ci LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3); 111cabdff1aSopenharmony_ci 112cabdff1aSopenharmony_ci vec0 = __lasx_xvsrai_h(src7, 1); 113cabdff1aSopenharmony_ci vec0 = __lasx_xvsub_h(src5, vec0); 114cabdff1aSopenharmony_ci vec0 = __lasx_xvsub_h(vec0, src3); 115cabdff1aSopenharmony_ci vec0 = __lasx_xvsub_h(vec0, src7); 116cabdff1aSopenharmony_ci 117cabdff1aSopenharmony_ci vec1 = __lasx_xvsrai_h(src3, 1); 118cabdff1aSopenharmony_ci vec1 = __lasx_xvsub_h(src1, vec1); 119cabdff1aSopenharmony_ci vec1 = __lasx_xvadd_h(vec1, src7); 120cabdff1aSopenharmony_ci vec1 = __lasx_xvsub_h(vec1, src3); 121cabdff1aSopenharmony_ci 122cabdff1aSopenharmony_ci vec2 = __lasx_xvsrai_h(src5, 1); 123cabdff1aSopenharmony_ci vec2 = __lasx_xvsub_h(vec2, src1); 124cabdff1aSopenharmony_ci vec2 = __lasx_xvadd_h(vec2, src7); 125cabdff1aSopenharmony_ci vec2 = __lasx_xvadd_h(vec2, src5); 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_ci vec3 = __lasx_xvsrai_h(src1, 1); 128cabdff1aSopenharmony_ci vec3 = __lasx_xvadd_h(src3, vec3); 129cabdff1aSopenharmony_ci vec3 = __lasx_xvadd_h(vec3, src5); 130cabdff1aSopenharmony_ci vec3 = __lasx_xvadd_h(vec3, src1); 131cabdff1aSopenharmony_ci 132cabdff1aSopenharmony_ci tmp4 = __lasx_xvsrai_h(vec3, 2); 133cabdff1aSopenharmony_ci tmp4 = __lasx_xvadd_h(tmp4, vec0); 134cabdff1aSopenharmony_ci tmp5 = __lasx_xvsrai_h(vec2, 2); 135cabdff1aSopenharmony_ci tmp5 = __lasx_xvadd_h(tmp5, vec1); 136cabdff1aSopenharmony_ci tmp6 = __lasx_xvsrai_h(vec1, 2); 137cabdff1aSopenharmony_ci tmp6 = __lasx_xvsub_h(tmp6, vec2); 138cabdff1aSopenharmony_ci tmp7 = __lasx_xvsrai_h(vec0, 2); 139cabdff1aSopenharmony_ci tmp7 = __lasx_xvsub_h(vec3, tmp7); 140cabdff1aSopenharmony_ci 141cabdff1aSopenharmony_ci LASX_BUTTERFLY_8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 142cabdff1aSopenharmony_ci res0, res1, res2, res3, res4, res5, res6, res7); 143cabdff1aSopenharmony_ci LASX_TRANSPOSE8x8_H(res0, res1, res2, res3, res4, res5, res6, res7, 144cabdff1aSopenharmony_ci res0, res1, res2, res3, res4, res5, res6, res7); 145cabdff1aSopenharmony_ci 146cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_vext2xv_w_h, res0, res1, res2, res3, 147cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 148cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_vext2xv_w_h, res4, res5, res6, res7, 149cabdff1aSopenharmony_ci tmp4, tmp5, tmp6, tmp7); 150cabdff1aSopenharmony_ci vec0 = __lasx_xvadd_w(tmp0, tmp4); 151cabdff1aSopenharmony_ci vec1 = __lasx_xvsub_w(tmp0, tmp4); 152cabdff1aSopenharmony_ci 153cabdff1aSopenharmony_ci vec2 = __lasx_xvsrai_w(tmp2, 1); 154cabdff1aSopenharmony_ci vec2 = __lasx_xvsub_w(vec2, tmp6); 155cabdff1aSopenharmony_ci vec3 = __lasx_xvsrai_w(tmp6, 1); 156cabdff1aSopenharmony_ci vec3 = __lasx_xvadd_w(vec3, tmp2); 157cabdff1aSopenharmony_ci 158cabdff1aSopenharmony_ci tmp0 = __lasx_xvadd_w(vec0, vec3); 159cabdff1aSopenharmony_ci tmp2 = __lasx_xvadd_w(vec1, vec2); 160cabdff1aSopenharmony_ci tmp4 = __lasx_xvsub_w(vec1, vec2); 161cabdff1aSopenharmony_ci tmp6 = __lasx_xvsub_w(vec0, vec3); 162cabdff1aSopenharmony_ci 163cabdff1aSopenharmony_ci vec0 = __lasx_xvsrai_w(tmp7, 1); 164cabdff1aSopenharmony_ci vec0 = __lasx_xvsub_w(tmp5, vec0); 165cabdff1aSopenharmony_ci vec0 = __lasx_xvsub_w(vec0, tmp3); 166cabdff1aSopenharmony_ci vec0 = __lasx_xvsub_w(vec0, tmp7); 167cabdff1aSopenharmony_ci 168cabdff1aSopenharmony_ci vec1 = __lasx_xvsrai_w(tmp3, 1); 169cabdff1aSopenharmony_ci vec1 = __lasx_xvsub_w(tmp1, vec1); 170cabdff1aSopenharmony_ci vec1 = __lasx_xvadd_w(vec1, tmp7); 171cabdff1aSopenharmony_ci vec1 = __lasx_xvsub_w(vec1, tmp3); 172cabdff1aSopenharmony_ci 173cabdff1aSopenharmony_ci vec2 = __lasx_xvsrai_w(tmp5, 1); 174cabdff1aSopenharmony_ci vec2 = __lasx_xvsub_w(vec2, tmp1); 175cabdff1aSopenharmony_ci vec2 = __lasx_xvadd_w(vec2, tmp7); 176cabdff1aSopenharmony_ci vec2 = __lasx_xvadd_w(vec2, tmp5); 177cabdff1aSopenharmony_ci 178cabdff1aSopenharmony_ci vec3 = __lasx_xvsrai_w(tmp1, 1); 179cabdff1aSopenharmony_ci vec3 = __lasx_xvadd_w(tmp3, vec3); 180cabdff1aSopenharmony_ci vec3 = __lasx_xvadd_w(vec3, tmp5); 181cabdff1aSopenharmony_ci vec3 = __lasx_xvadd_w(vec3, tmp1); 182cabdff1aSopenharmony_ci 183cabdff1aSopenharmony_ci tmp1 = __lasx_xvsrai_w(vec3, 2); 184cabdff1aSopenharmony_ci tmp1 = __lasx_xvadd_w(tmp1, vec0); 185cabdff1aSopenharmony_ci tmp3 = __lasx_xvsrai_w(vec2, 2); 186cabdff1aSopenharmony_ci tmp3 = __lasx_xvadd_w(tmp3, vec1); 187cabdff1aSopenharmony_ci tmp5 = __lasx_xvsrai_w(vec1, 2); 188cabdff1aSopenharmony_ci tmp5 = __lasx_xvsub_w(tmp5, vec2); 189cabdff1aSopenharmony_ci tmp7 = __lasx_xvsrai_w(vec0, 2); 190cabdff1aSopenharmony_ci tmp7 = __lasx_xvsub_w(vec3, tmp7); 191cabdff1aSopenharmony_ci 192cabdff1aSopenharmony_ci LASX_BUTTERFLY_4_W(tmp0, tmp2, tmp5, tmp7, res0, res1, res6, res7); 193cabdff1aSopenharmony_ci LASX_BUTTERFLY_4_W(tmp4, tmp6, tmp1, tmp3, res2, res3, res4, res5); 194cabdff1aSopenharmony_ci 195cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsrai_w, res0, 6, res1, 6, res2, 6, res3, 6, 196cabdff1aSopenharmony_ci res0, res1, res2, res3); 197cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsrai_w, res4, 6, res5, 6, res6, 6, res7, 6, 198cabdff1aSopenharmony_ci res4, res5, res6, res7); 199cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_h, res1, res0, res3, res2, res5, res4, res7, 200cabdff1aSopenharmony_ci res6, res0, res1, res2, res3); 201cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpermi_d, res0, 0xd8, res1, 0xd8, res2, 0xd8, res3, 0xd8, 202cabdff1aSopenharmony_ci res0, res1, res2, res3); 203cabdff1aSopenharmony_ci 204cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x, 205cabdff1aSopenharmony_ci dst, dst_stride_3x, dst0, dst1, dst2, dst3); 206cabdff1aSopenharmony_ci dst += dst_stride_4x; 207cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x, 208cabdff1aSopenharmony_ci dst, dst_stride_3x, dst4, dst5, dst6, dst7); 209cabdff1aSopenharmony_ci dst -= dst_stride_4x; 210cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3, 211cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 212cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, zero, dst4, zero, dst5, zero, dst6, zero, dst7, 213cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 214cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5, 215cabdff1aSopenharmony_ci dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3); 216cabdff1aSopenharmony_ci res0 = __lasx_xvadd_h(res0, dst0); 217cabdff1aSopenharmony_ci res1 = __lasx_xvadd_h(res1, dst1); 218cabdff1aSopenharmony_ci res2 = __lasx_xvadd_h(res2, dst2); 219cabdff1aSopenharmony_ci res3 = __lasx_xvadd_h(res3, dst3); 220cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, res0, res1, res2, res3, res0, res1, 221cabdff1aSopenharmony_ci res2, res3); 222cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, res1, res0, res3, res2, res0, res1); 223cabdff1aSopenharmony_ci __lasx_xvstelm_d(res0, dst, 0, 0); 224cabdff1aSopenharmony_ci __lasx_xvstelm_d(res0, dst + dst_stride, 0, 2); 225cabdff1aSopenharmony_ci __lasx_xvstelm_d(res0, dst + dst_stride_2x, 0, 1); 226cabdff1aSopenharmony_ci __lasx_xvstelm_d(res0, dst + dst_stride_3x, 0, 3); 227cabdff1aSopenharmony_ci dst += dst_stride_4x; 228cabdff1aSopenharmony_ci __lasx_xvstelm_d(res1, dst, 0, 0); 229cabdff1aSopenharmony_ci __lasx_xvstelm_d(res1, dst + dst_stride, 0, 2); 230cabdff1aSopenharmony_ci __lasx_xvstelm_d(res1, dst + dst_stride_2x, 0, 1); 231cabdff1aSopenharmony_ci __lasx_xvstelm_d(res1, dst + dst_stride_3x, 0, 3); 232cabdff1aSopenharmony_ci} 233cabdff1aSopenharmony_ci 234cabdff1aSopenharmony_civoid ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src, 235cabdff1aSopenharmony_ci int32_t dst_stride) 236cabdff1aSopenharmony_ci{ 237cabdff1aSopenharmony_ci const int16_t dc = (src[0] + 32) >> 6; 238cabdff1aSopenharmony_ci int32_t dst_stride_2x = dst_stride << 1; 239cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 240cabdff1aSopenharmony_ci __m256i pred, out; 241cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3; 242cabdff1aSopenharmony_ci __m256i input_dc = __lasx_xvreplgr2vr_h(dc); 243cabdff1aSopenharmony_ci 244cabdff1aSopenharmony_ci src[0] = 0; 245cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x, 246cabdff1aSopenharmony_ci dst, dst_stride_3x, src0, src1, src2, src3); 247cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src0, src1); 248cabdff1aSopenharmony_ci 249cabdff1aSopenharmony_ci pred = __lasx_xvpermi_q(src0, src1, 0x02); 250cabdff1aSopenharmony_ci pred = __lasx_xvaddw_h_h_bu(input_dc, pred); 251cabdff1aSopenharmony_ci pred = __lasx_xvclip255_h(pred); 252cabdff1aSopenharmony_ci out = __lasx_xvpickev_b(pred, pred); 253cabdff1aSopenharmony_ci __lasx_xvstelm_w(out, dst, 0, 0); 254cabdff1aSopenharmony_ci __lasx_xvstelm_w(out, dst + dst_stride, 0, 1); 255cabdff1aSopenharmony_ci __lasx_xvstelm_w(out, dst + dst_stride_2x, 0, 4); 256cabdff1aSopenharmony_ci __lasx_xvstelm_w(out, dst + dst_stride_3x, 0, 5); 257cabdff1aSopenharmony_ci} 258cabdff1aSopenharmony_ci 259cabdff1aSopenharmony_civoid ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src, 260cabdff1aSopenharmony_ci int32_t dst_stride) 261cabdff1aSopenharmony_ci{ 262cabdff1aSopenharmony_ci int32_t dc_val; 263cabdff1aSopenharmony_ci int32_t dst_stride_2x = dst_stride << 1; 264cabdff1aSopenharmony_ci int32_t dst_stride_4x = dst_stride << 2; 265cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 266cabdff1aSopenharmony_ci __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 267cabdff1aSopenharmony_ci __m256i dc; 268cabdff1aSopenharmony_ci 269cabdff1aSopenharmony_ci dc_val = (src[0] + 32) >> 6; 270cabdff1aSopenharmony_ci dc = __lasx_xvreplgr2vr_h(dc_val); 271cabdff1aSopenharmony_ci 272cabdff1aSopenharmony_ci src[0] = 0; 273cabdff1aSopenharmony_ci 274cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x, 275cabdff1aSopenharmony_ci dst, dst_stride_3x, dst0, dst1, dst2, dst3); 276cabdff1aSopenharmony_ci dst += dst_stride_4x; 277cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x, 278cabdff1aSopenharmony_ci dst, dst_stride_3x, dst4, dst5, dst6, dst7); 279cabdff1aSopenharmony_ci dst -= dst_stride_4x; 280cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_vext2xv_hu_bu, dst0, dst1, dst2, dst3, 281cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 282cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_vext2xv_hu_bu, dst4, dst5, dst6, dst7, 283cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 284cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5, 285cabdff1aSopenharmony_ci dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3); 286cabdff1aSopenharmony_ci dst0 = __lasx_xvadd_h(dst0, dc); 287cabdff1aSopenharmony_ci dst1 = __lasx_xvadd_h(dst1, dc); 288cabdff1aSopenharmony_ci dst2 = __lasx_xvadd_h(dst2, dc); 289cabdff1aSopenharmony_ci dst3 = __lasx_xvadd_h(dst3, dc); 290cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, dst0, dst1, dst2, dst3, 291cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 292cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, dst1, dst0, dst3, dst2, dst0, dst1); 293cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 0); 294cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + dst_stride, 0, 2); 295cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + dst_stride_2x, 0, 1); 296cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + dst_stride_3x, 0, 3); 297cabdff1aSopenharmony_ci dst += dst_stride_4x; 298cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 0, 0); 299cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + dst_stride, 0, 2); 300cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + dst_stride_2x, 0, 1); 301cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + dst_stride_3x, 0, 3); 302cabdff1aSopenharmony_ci} 303cabdff1aSopenharmony_ci 304cabdff1aSopenharmony_civoid ff_h264_idct_add16_lasx(uint8_t *dst, 305cabdff1aSopenharmony_ci const int32_t *blk_offset, 306cabdff1aSopenharmony_ci int16_t *block, int32_t dst_stride, 307cabdff1aSopenharmony_ci const uint8_t nzc[15 * 8]) 308cabdff1aSopenharmony_ci{ 309cabdff1aSopenharmony_ci int32_t i; 310cabdff1aSopenharmony_ci 311cabdff1aSopenharmony_ci for (i = 0; i < 16; i++) { 312cabdff1aSopenharmony_ci int32_t nnz = nzc[scan8[i]]; 313cabdff1aSopenharmony_ci 314cabdff1aSopenharmony_ci if (nnz) { 315cabdff1aSopenharmony_ci if (nnz == 1 && ((dctcoef *) block)[i * 16]) 316cabdff1aSopenharmony_ci ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i], 317cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 318cabdff1aSopenharmony_ci dst_stride); 319cabdff1aSopenharmony_ci else 320cabdff1aSopenharmony_ci ff_h264_idct_add_lasx(dst + blk_offset[i], 321cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 322cabdff1aSopenharmony_ci dst_stride); 323cabdff1aSopenharmony_ci } 324cabdff1aSopenharmony_ci } 325cabdff1aSopenharmony_ci} 326cabdff1aSopenharmony_ci 327cabdff1aSopenharmony_civoid ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset, 328cabdff1aSopenharmony_ci int16_t *block, int32_t dst_stride, 329cabdff1aSopenharmony_ci const uint8_t nzc[15 * 8]) 330cabdff1aSopenharmony_ci{ 331cabdff1aSopenharmony_ci int32_t cnt; 332cabdff1aSopenharmony_ci 333cabdff1aSopenharmony_ci for (cnt = 0; cnt < 16; cnt += 4) { 334cabdff1aSopenharmony_ci int32_t nnz = nzc[scan8[cnt]]; 335cabdff1aSopenharmony_ci 336cabdff1aSopenharmony_ci if (nnz) { 337cabdff1aSopenharmony_ci if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) 338cabdff1aSopenharmony_ci ff_h264_idct8_dc_addblk_lasx(dst + blk_offset[cnt], 339cabdff1aSopenharmony_ci block + cnt * 16 * sizeof(pixel), 340cabdff1aSopenharmony_ci dst_stride); 341cabdff1aSopenharmony_ci else 342cabdff1aSopenharmony_ci ff_h264_idct8_addblk_lasx(dst + blk_offset[cnt], 343cabdff1aSopenharmony_ci block + cnt * 16 * sizeof(pixel), 344cabdff1aSopenharmony_ci dst_stride); 345cabdff1aSopenharmony_ci } 346cabdff1aSopenharmony_ci } 347cabdff1aSopenharmony_ci} 348cabdff1aSopenharmony_ci 349cabdff1aSopenharmony_ci 350cabdff1aSopenharmony_civoid ff_h264_idct_add8_lasx(uint8_t **dst, 351cabdff1aSopenharmony_ci const int32_t *blk_offset, 352cabdff1aSopenharmony_ci int16_t *block, int32_t dst_stride, 353cabdff1aSopenharmony_ci const uint8_t nzc[15 * 8]) 354cabdff1aSopenharmony_ci{ 355cabdff1aSopenharmony_ci int32_t i; 356cabdff1aSopenharmony_ci 357cabdff1aSopenharmony_ci for (i = 16; i < 20; i++) { 358cabdff1aSopenharmony_ci if (nzc[scan8[i]]) 359cabdff1aSopenharmony_ci ff_h264_idct_add_lasx(dst[0] + blk_offset[i], 360cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 361cabdff1aSopenharmony_ci dst_stride); 362cabdff1aSopenharmony_ci else if (((dctcoef *) block)[i * 16]) 363cabdff1aSopenharmony_ci ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i], 364cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 365cabdff1aSopenharmony_ci dst_stride); 366cabdff1aSopenharmony_ci } 367cabdff1aSopenharmony_ci for (i = 32; i < 36; i++) { 368cabdff1aSopenharmony_ci if (nzc[scan8[i]]) 369cabdff1aSopenharmony_ci ff_h264_idct_add_lasx(dst[1] + blk_offset[i], 370cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 371cabdff1aSopenharmony_ci dst_stride); 372cabdff1aSopenharmony_ci else if (((dctcoef *) block)[i * 16]) 373cabdff1aSopenharmony_ci ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i], 374cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 375cabdff1aSopenharmony_ci dst_stride); 376cabdff1aSopenharmony_ci } 377cabdff1aSopenharmony_ci} 378cabdff1aSopenharmony_ci 379cabdff1aSopenharmony_civoid ff_h264_idct_add8_422_lasx(uint8_t **dst, 380cabdff1aSopenharmony_ci const int32_t *blk_offset, 381cabdff1aSopenharmony_ci int16_t *block, int32_t dst_stride, 382cabdff1aSopenharmony_ci const uint8_t nzc[15 * 8]) 383cabdff1aSopenharmony_ci{ 384cabdff1aSopenharmony_ci int32_t i; 385cabdff1aSopenharmony_ci 386cabdff1aSopenharmony_ci for (i = 16; i < 20; i++) { 387cabdff1aSopenharmony_ci if (nzc[scan8[i]]) 388cabdff1aSopenharmony_ci ff_h264_idct_add_lasx(dst[0] + blk_offset[i], 389cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 390cabdff1aSopenharmony_ci dst_stride); 391cabdff1aSopenharmony_ci else if (((dctcoef *) block)[i * 16]) 392cabdff1aSopenharmony_ci ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i], 393cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 394cabdff1aSopenharmony_ci dst_stride); 395cabdff1aSopenharmony_ci } 396cabdff1aSopenharmony_ci for (i = 32; i < 36; i++) { 397cabdff1aSopenharmony_ci if (nzc[scan8[i]]) 398cabdff1aSopenharmony_ci ff_h264_idct_add_lasx(dst[1] + blk_offset[i], 399cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 400cabdff1aSopenharmony_ci dst_stride); 401cabdff1aSopenharmony_ci else if (((dctcoef *) block)[i * 16]) 402cabdff1aSopenharmony_ci ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i], 403cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 404cabdff1aSopenharmony_ci dst_stride); 405cabdff1aSopenharmony_ci } 406cabdff1aSopenharmony_ci for (i = 20; i < 24; i++) { 407cabdff1aSopenharmony_ci if (nzc[scan8[i + 4]]) 408cabdff1aSopenharmony_ci ff_h264_idct_add_lasx(dst[0] + blk_offset[i + 4], 409cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 410cabdff1aSopenharmony_ci dst_stride); 411cabdff1aSopenharmony_ci else if (((dctcoef *) block)[i * 16]) 412cabdff1aSopenharmony_ci ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i + 4], 413cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 414cabdff1aSopenharmony_ci dst_stride); 415cabdff1aSopenharmony_ci } 416cabdff1aSopenharmony_ci for (i = 36; i < 40; i++) { 417cabdff1aSopenharmony_ci if (nzc[scan8[i + 4]]) 418cabdff1aSopenharmony_ci ff_h264_idct_add_lasx(dst[1] + blk_offset[i + 4], 419cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 420cabdff1aSopenharmony_ci dst_stride); 421cabdff1aSopenharmony_ci else if (((dctcoef *) block)[i * 16]) 422cabdff1aSopenharmony_ci ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i + 4], 423cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 424cabdff1aSopenharmony_ci dst_stride); 425cabdff1aSopenharmony_ci } 426cabdff1aSopenharmony_ci} 427cabdff1aSopenharmony_ci 428cabdff1aSopenharmony_civoid ff_h264_idct_add16_intra_lasx(uint8_t *dst, 429cabdff1aSopenharmony_ci const int32_t *blk_offset, 430cabdff1aSopenharmony_ci int16_t *block, 431cabdff1aSopenharmony_ci int32_t dst_stride, 432cabdff1aSopenharmony_ci const uint8_t nzc[15 * 8]) 433cabdff1aSopenharmony_ci{ 434cabdff1aSopenharmony_ci int32_t i; 435cabdff1aSopenharmony_ci 436cabdff1aSopenharmony_ci for (i = 0; i < 16; i++) { 437cabdff1aSopenharmony_ci if (nzc[scan8[i]]) 438cabdff1aSopenharmony_ci ff_h264_idct_add_lasx(dst + blk_offset[i], 439cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), dst_stride); 440cabdff1aSopenharmony_ci else if (((dctcoef *) block)[i * 16]) 441cabdff1aSopenharmony_ci ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i], 442cabdff1aSopenharmony_ci block + i * 16 * sizeof(pixel), 443cabdff1aSopenharmony_ci dst_stride); 444cabdff1aSopenharmony_ci } 445cabdff1aSopenharmony_ci} 446cabdff1aSopenharmony_ci 447cabdff1aSopenharmony_civoid ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src, 448cabdff1aSopenharmony_ci int32_t de_qval) 449cabdff1aSopenharmony_ci{ 450cabdff1aSopenharmony_ci#define DC_DEST_STRIDE 16 451cabdff1aSopenharmony_ci 452cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3; 453cabdff1aSopenharmony_ci __m256i vec0, vec1, vec2, vec3; 454cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3; 455cabdff1aSopenharmony_ci __m256i hres0, hres1, hres2, hres3; 456cabdff1aSopenharmony_ci __m256i vres0, vres1, vres2, vres3; 457cabdff1aSopenharmony_ci __m256i de_q_vec = __lasx_xvreplgr2vr_w(de_qval); 458cabdff1aSopenharmony_ci 459cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvld, src, 0, src, 8, src, 16, src, 24, 460cabdff1aSopenharmony_ci src0, src1, src2, src3); 461cabdff1aSopenharmony_ci LASX_TRANSPOSE4x4_H(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); 462cabdff1aSopenharmony_ci LASX_BUTTERFLY_4_H(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1); 463cabdff1aSopenharmony_ci LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1); 464cabdff1aSopenharmony_ci LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3, 465cabdff1aSopenharmony_ci hres0, hres1, hres2, hres3); 466cabdff1aSopenharmony_ci LASX_BUTTERFLY_4_H(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1); 467cabdff1aSopenharmony_ci LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3); 468cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_vext2xv_w_h, vres0, vres1, vres2, vres3, 469cabdff1aSopenharmony_ci vres0, vres1, vres2, vres3); 470cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, vres1, vres0, 0x20, vres3, vres2, 0x20, 471cabdff1aSopenharmony_ci vres0, vres1); 472cabdff1aSopenharmony_ci 473cabdff1aSopenharmony_ci vres0 = __lasx_xvmul_w(vres0, de_q_vec); 474cabdff1aSopenharmony_ci vres1 = __lasx_xvmul_w(vres1, de_q_vec); 475cabdff1aSopenharmony_ci 476cabdff1aSopenharmony_ci vres0 = __lasx_xvsrari_w(vres0, 8); 477cabdff1aSopenharmony_ci vres1 = __lasx_xvsrari_w(vres1, 8); 478cabdff1aSopenharmony_ci vec0 = __lasx_xvpickev_h(vres1, vres0); 479cabdff1aSopenharmony_ci vec0 = __lasx_xvpermi_d(vec0, 0xd8); 480cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 0 * DC_DEST_STRIDE, 0, 0); 481cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 2 * DC_DEST_STRIDE, 0, 1); 482cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 8 * DC_DEST_STRIDE, 0, 2); 483cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 10 * DC_DEST_STRIDE, 0, 3); 484cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 1 * DC_DEST_STRIDE, 0, 4); 485cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 3 * DC_DEST_STRIDE, 0, 5); 486cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 9 * DC_DEST_STRIDE, 0, 6); 487cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 11 * DC_DEST_STRIDE, 0, 7); 488cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 4 * DC_DEST_STRIDE, 0, 8); 489cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 6 * DC_DEST_STRIDE, 0, 9); 490cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 12 * DC_DEST_STRIDE, 0, 10); 491cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 14 * DC_DEST_STRIDE, 0, 11); 492cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 5 * DC_DEST_STRIDE, 0, 12); 493cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 7 * DC_DEST_STRIDE, 0, 13); 494cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 13 * DC_DEST_STRIDE, 0, 14); 495cabdff1aSopenharmony_ci __lasx_xvstelm_h(vec0, dst + 15 * DC_DEST_STRIDE, 0, 15); 496cabdff1aSopenharmony_ci 497cabdff1aSopenharmony_ci#undef DC_DEST_STRIDE 498cabdff1aSopenharmony_ci} 499