1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Loongson LASX optimized h264chroma 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * Copyright (c) 2020 Loongson Technology Corporation Limited 5cabdff1aSopenharmony_ci * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * This file is part of FFmpeg. 8cabdff1aSopenharmony_ci * 9cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 10cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 11cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 12cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 13cabdff1aSopenharmony_ci * 14cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 15cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 16cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17cabdff1aSopenharmony_ci * Lesser General Public License for more details. 
18cabdff1aSopenharmony_ci * 19cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 20cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 21cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22cabdff1aSopenharmony_ci */ 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_ci#include "h264chroma_lasx.h" 25cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 26cabdff1aSopenharmony_ci#include "libavutil/avassert.h" 27cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 28cabdff1aSopenharmony_ci 29cabdff1aSopenharmony_cistatic const uint8_t chroma_mask_arr[64] = { 30cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 31cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 32cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 33cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 34cabdff1aSopenharmony_ci}; 35cabdff1aSopenharmony_ci 36cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hv_8x4_lasx(uint8_t *src, uint8_t *dst, 37cabdff1aSopenharmony_ci ptrdiff_t stride, uint32_t coef_hor0, 38cabdff1aSopenharmony_ci uint32_t coef_hor1, uint32_t coef_ver0, 39cabdff1aSopenharmony_ci uint32_t coef_ver1) 40cabdff1aSopenharmony_ci{ 41cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 42cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 43cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride_2x << 1; 44cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, out; 45cabdff1aSopenharmony_ci __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1; 46cabdff1aSopenharmony_ci __m256i mask; 47cabdff1aSopenharmony_ci __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 48cabdff1aSopenharmony_ci __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 49cabdff1aSopenharmony_ci __m256i coeff_hz_vec = 
__lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 50cabdff1aSopenharmony_ci __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 51cabdff1aSopenharmony_ci __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 52cabdff1aSopenharmony_ci 53cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 54cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 55cabdff1aSopenharmony_ci src1, src2, src3, src4); 56cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3); 57cabdff1aSopenharmony_ci src0 = __lasx_xvshuf_b(src0, src0, mask); 58cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3); 59cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1); 60cabdff1aSopenharmony_ci res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec); 61cabdff1aSopenharmony_ci res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0); 62cabdff1aSopenharmony_ci res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0); 63cabdff1aSopenharmony_ci res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20); 64cabdff1aSopenharmony_ci res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3); 65cabdff1aSopenharmony_ci res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1); 66cabdff1aSopenharmony_ci res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1); 67cabdff1aSopenharmony_ci out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6); 68cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst, 0, 0); 69cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride, 0, 2); 70cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 71cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 72cabdff1aSopenharmony_ci} 73cabdff1aSopenharmony_ci 74cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hv_8x8_lasx(uint8_t *src, uint8_t *dst, 75cabdff1aSopenharmony_ci ptrdiff_t stride, uint32_t 
coef_hor0, 76cabdff1aSopenharmony_ci uint32_t coef_hor1, uint32_t coef_ver0, 77cabdff1aSopenharmony_ci uint32_t coef_ver1) 78cabdff1aSopenharmony_ci{ 79cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 80cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 81cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 82cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 83cabdff1aSopenharmony_ci __m256i out0, out1; 84cabdff1aSopenharmony_ci __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4; 85cabdff1aSopenharmony_ci __m256i res_vt0, res_vt1, res_vt2, res_vt3; 86cabdff1aSopenharmony_ci __m256i mask; 87cabdff1aSopenharmony_ci __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 88cabdff1aSopenharmony_ci __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 89cabdff1aSopenharmony_ci __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 90cabdff1aSopenharmony_ci __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 91cabdff1aSopenharmony_ci __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 92cabdff1aSopenharmony_ci 93cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 94cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 95cabdff1aSopenharmony_ci src1, src2, src3, src4); 96cabdff1aSopenharmony_ci src += stride_4x; 97cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 98cabdff1aSopenharmony_ci src5, src6, src7, src8); 99cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20, 100cabdff1aSopenharmony_ci src8, src7, 0x20, src1, src3, src5, src7); 101cabdff1aSopenharmony_ci src0 = __lasx_xvshuf_b(src0, src0, mask); 102cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7, 103cabdff1aSopenharmony_ci src7, mask, src1, src3, src5, src7); 
104cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3, 105cabdff1aSopenharmony_ci coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3); 106cabdff1aSopenharmony_ci res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec); 107cabdff1aSopenharmony_ci res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0); 108cabdff1aSopenharmony_ci res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0); 109cabdff1aSopenharmony_ci res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0); 110cabdff1aSopenharmony_ci res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0); 111cabdff1aSopenharmony_ci res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20); 112cabdff1aSopenharmony_ci res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3); 113cabdff1aSopenharmony_ci res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3); 114cabdff1aSopenharmony_ci res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3); 115cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvmadd_h, res_vt0, res_hz0, coeff_vt_vec1, res_vt1, res_hz1, coeff_vt_vec1, 116cabdff1aSopenharmony_ci res_vt2, res_hz2, coeff_vt_vec1, res_vt3, res_hz3, coeff_vt_vec1, 117cabdff1aSopenharmony_ci res_vt0, res_vt1, res_vt2, res_vt3); 118cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6, out0, out1); 119cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst, 0, 0); 120cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride, 0, 2); 121cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 122cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 123cabdff1aSopenharmony_ci dst += stride_4x; 124cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst, 0, 0); 125cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride, 0, 2); 126cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 127cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 128cabdff1aSopenharmony_ci} 129cabdff1aSopenharmony_ci 130cabdff1aSopenharmony_cistatic 
av_always_inline void avc_chroma_hz_8x4_lasx(uint8_t *src, uint8_t *dst, 131cabdff1aSopenharmony_ci ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1) 132cabdff1aSopenharmony_ci{ 133cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 134cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 135cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, out; 136cabdff1aSopenharmony_ci __m256i res0, res1; 137cabdff1aSopenharmony_ci __m256i mask; 138cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 139cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 140cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 141cabdff1aSopenharmony_ci 142cabdff1aSopenharmony_ci coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 143cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 144cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src1, src2); 145cabdff1aSopenharmony_ci src3 = __lasx_xvldx(src, stride_3x); 146cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2); 147cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2); 148cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); 149cabdff1aSopenharmony_ci out = __lasx_xvssrarni_bu_h(res1, res0, 6); 150cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst, 0, 0); 151cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride, 0, 2); 152cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 153cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 154cabdff1aSopenharmony_ci 155cabdff1aSopenharmony_ci} 156cabdff1aSopenharmony_ci 157cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hz_8x8_lasx(uint8_t *src, uint8_t *dst, 158cabdff1aSopenharmony_ci ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1) 
159cabdff1aSopenharmony_ci{ 160cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 161cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 162cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 163cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7; 164cabdff1aSopenharmony_ci __m256i out0, out1; 165cabdff1aSopenharmony_ci __m256i res0, res1, res2, res3; 166cabdff1aSopenharmony_ci __m256i mask; 167cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 168cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 169cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 170cabdff1aSopenharmony_ci 171cabdff1aSopenharmony_ci coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 172cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 173cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 174cabdff1aSopenharmony_ci src1, src2, src3, src4); 175cabdff1aSopenharmony_ci src += stride_4x; 176cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src5, src6); 177cabdff1aSopenharmony_ci src7 = __lasx_xvldx(src, stride_3x); 178cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20, 179cabdff1aSopenharmony_ci src7, src6, 0x20, src0, src2, src4, src6); 180cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4, mask, 181cabdff1aSopenharmony_ci src6, src6, mask, src0, src2, src4, src6); 182cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6, 183cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 184cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1); 185cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst, 0, 0); 186cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, 
dst + stride, 0, 2); 187cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 188cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 189cabdff1aSopenharmony_ci dst += stride_4x; 190cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst, 0, 0); 191cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride, 0, 2); 192cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 193cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 194cabdff1aSopenharmony_ci} 195cabdff1aSopenharmony_ci 196cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hz_nonmult_lasx(uint8_t *src, 197cabdff1aSopenharmony_ci uint8_t *dst, ptrdiff_t stride, uint32_t coeff0, 198cabdff1aSopenharmony_ci uint32_t coeff1, int32_t height) 199cabdff1aSopenharmony_ci{ 200cabdff1aSopenharmony_ci uint32_t row; 201cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 202cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 203cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 204cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, out; 205cabdff1aSopenharmony_ci __m256i res0, res1; 206cabdff1aSopenharmony_ci __m256i mask; 207cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 208cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 209cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 210cabdff1aSopenharmony_ci 211cabdff1aSopenharmony_ci mask = __lasx_xvld(chroma_mask_arr, 0); 212cabdff1aSopenharmony_ci coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 213cabdff1aSopenharmony_ci 214cabdff1aSopenharmony_ci for (row = height >> 2; row--;) { 215cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 216cabdff1aSopenharmony_ci src0, src1, src2, src3); 217cabdff1aSopenharmony_ci src += stride_4x; 218cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2); 
219cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2); 220cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); 221cabdff1aSopenharmony_ci out = __lasx_xvssrarni_bu_h(res1, res0, 6); 222cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst, 0, 0); 223cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride, 0, 2); 224cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 225cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 226cabdff1aSopenharmony_ci dst += stride_4x; 227cabdff1aSopenharmony_ci } 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ci if ((height & 3)) { 230cabdff1aSopenharmony_ci src0 = __lasx_xvld(src, 0); 231cabdff1aSopenharmony_ci src1 = __lasx_xvldx(src, stride); 232cabdff1aSopenharmony_ci src1 = __lasx_xvpermi_q(src1, src0, 0x20); 233cabdff1aSopenharmony_ci src0 = __lasx_xvshuf_b(src1, src1, mask); 234cabdff1aSopenharmony_ci res0 = __lasx_xvdp2_h_bu(src0, coeff_vec); 235cabdff1aSopenharmony_ci out = __lasx_xvssrarni_bu_h(res0, res0, 6); 236cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst, 0, 0); 237cabdff1aSopenharmony_ci dst += stride; 238cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst, 0, 2); 239cabdff1aSopenharmony_ci } 240cabdff1aSopenharmony_ci} 241cabdff1aSopenharmony_ci 242cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_vt_8x4_lasx(uint8_t *src, uint8_t *dst, 243cabdff1aSopenharmony_ci ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1) 244cabdff1aSopenharmony_ci{ 245cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 246cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 247cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, out; 248cabdff1aSopenharmony_ci __m256i res0, res1; 249cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 250cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 251cabdff1aSopenharmony_ci __m256i 
coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 252cabdff1aSopenharmony_ci 253cabdff1aSopenharmony_ci coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 254cabdff1aSopenharmony_ci src0 = __lasx_xvld(src, 0); 255cabdff1aSopenharmony_ci src += stride; 256cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 257cabdff1aSopenharmony_ci src1, src2, src3, src4); 258cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, 259cabdff1aSopenharmony_ci src4, src3, 0x20, src0, src1, src2, src3); 260cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2); 261cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); 262cabdff1aSopenharmony_ci out = __lasx_xvssrarni_bu_h(res1, res0, 6); 263cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst, 0, 0); 264cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride, 0, 2); 265cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 266cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 267cabdff1aSopenharmony_ci} 268cabdff1aSopenharmony_ci 269cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_vt_8x8_lasx(uint8_t *src, uint8_t *dst, 270cabdff1aSopenharmony_ci ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1) 271cabdff1aSopenharmony_ci{ 272cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 273cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 274cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 275cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 276cabdff1aSopenharmony_ci __m256i out0, out1; 277cabdff1aSopenharmony_ci __m256i res0, res1, res2, res3; 278cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 279cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 280cabdff1aSopenharmony_ci __m256i coeff_vec = 
__lasx_xvilvl_b(coeff_vec0, coeff_vec1); 281cabdff1aSopenharmony_ci 282cabdff1aSopenharmony_ci coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 283cabdff1aSopenharmony_ci src0 = __lasx_xvld(src, 0); 284cabdff1aSopenharmony_ci src += stride; 285cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 286cabdff1aSopenharmony_ci src1, src2, src3, src4); 287cabdff1aSopenharmony_ci src += stride_4x; 288cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 289cabdff1aSopenharmony_ci src5, src6, src7, src8); 290cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, 291cabdff1aSopenharmony_ci src4, src3, 0x20, src0, src1, src2, src3); 292cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20, 293cabdff1aSopenharmony_ci src8, src7, 0x20, src4, src5, src6, src7); 294cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6, 295cabdff1aSopenharmony_ci src0, src2, src4, src6); 296cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, 297cabdff1aSopenharmony_ci src6, coeff_vec, res0, res1, res2, res3); 298cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1); 299cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst, 0, 0); 300cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride, 0, 2); 301cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 302cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 303cabdff1aSopenharmony_ci dst += stride_4x; 304cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst, 0, 0); 305cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride, 0, 2); 306cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 307cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 
308cabdff1aSopenharmony_ci} 309cabdff1aSopenharmony_ci 310cabdff1aSopenharmony_cistatic av_always_inline void copy_width8x8_lasx(uint8_t *src, uint8_t *dst, 311cabdff1aSopenharmony_ci ptrdiff_t stride) 312cabdff1aSopenharmony_ci{ 313cabdff1aSopenharmony_ci uint64_t tmp[8]; 314cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3, stride_4; 315cabdff1aSopenharmony_ci __asm__ volatile ( 316cabdff1aSopenharmony_ci "slli.d %[stride_2], %[stride], 1 \n\t" 317cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[stride] \n\t" 318cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 319cabdff1aSopenharmony_ci "ld.d %[tmp0], %[src], 0x0 \n\t" 320cabdff1aSopenharmony_ci "ldx.d %[tmp1], %[src], %[stride] \n\t" 321cabdff1aSopenharmony_ci "ldx.d %[tmp2], %[src], %[stride_2] \n\t" 322cabdff1aSopenharmony_ci "ldx.d %[tmp3], %[src], %[stride_3] \n\t" 323cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 324cabdff1aSopenharmony_ci "ld.d %[tmp4], %[src], 0x0 \n\t" 325cabdff1aSopenharmony_ci "ldx.d %[tmp5], %[src], %[stride] \n\t" 326cabdff1aSopenharmony_ci "ldx.d %[tmp6], %[src], %[stride_2] \n\t" 327cabdff1aSopenharmony_ci "ldx.d %[tmp7], %[src], %[stride_3] \n\t" 328cabdff1aSopenharmony_ci 329cabdff1aSopenharmony_ci "st.d %[tmp0], %[dst], 0x0 \n\t" 330cabdff1aSopenharmony_ci "stx.d %[tmp1], %[dst], %[stride] \n\t" 331cabdff1aSopenharmony_ci "stx.d %[tmp2], %[dst], %[stride_2] \n\t" 332cabdff1aSopenharmony_ci "stx.d %[tmp3], %[dst], %[stride_3] \n\t" 333cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride_4] \n\t" 334cabdff1aSopenharmony_ci "st.d %[tmp4], %[dst], 0x0 \n\t" 335cabdff1aSopenharmony_ci "stx.d %[tmp5], %[dst], %[stride] \n\t" 336cabdff1aSopenharmony_ci "stx.d %[tmp6], %[dst], %[stride_2] \n\t" 337cabdff1aSopenharmony_ci "stx.d %[tmp7], %[dst], %[stride_3] \n\t" 338cabdff1aSopenharmony_ci : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), 339cabdff1aSopenharmony_ci [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]), 340cabdff1aSopenharmony_ci 
[tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]), 341cabdff1aSopenharmony_ci [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]), 342cabdff1aSopenharmony_ci [dst]"+&r"(dst), [src]"+&r"(src), 343cabdff1aSopenharmony_ci [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 344cabdff1aSopenharmony_ci [stride_4]"=&r"(stride_4) 345cabdff1aSopenharmony_ci : [stride]"r"(stride) 346cabdff1aSopenharmony_ci : "memory" 347cabdff1aSopenharmony_ci ); 348cabdff1aSopenharmony_ci} 349cabdff1aSopenharmony_ci 350cabdff1aSopenharmony_cistatic av_always_inline void copy_width8x4_lasx(uint8_t *src, uint8_t *dst, 351cabdff1aSopenharmony_ci ptrdiff_t stride) 352cabdff1aSopenharmony_ci{ 353cabdff1aSopenharmony_ci uint64_t tmp[4]; 354cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3; 355cabdff1aSopenharmony_ci __asm__ volatile ( 356cabdff1aSopenharmony_ci "slli.d %[stride_2], %[stride], 1 \n\t" 357cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[stride] \n\t" 358cabdff1aSopenharmony_ci "ld.d %[tmp0], %[src], 0x0 \n\t" 359cabdff1aSopenharmony_ci "ldx.d %[tmp1], %[src], %[stride] \n\t" 360cabdff1aSopenharmony_ci "ldx.d %[tmp2], %[src], %[stride_2] \n\t" 361cabdff1aSopenharmony_ci "ldx.d %[tmp3], %[src], %[stride_3] \n\t" 362cabdff1aSopenharmony_ci 363cabdff1aSopenharmony_ci "st.d %[tmp0], %[dst], 0x0 \n\t" 364cabdff1aSopenharmony_ci "stx.d %[tmp1], %[dst], %[stride] \n\t" 365cabdff1aSopenharmony_ci "stx.d %[tmp2], %[dst], %[stride_2] \n\t" 366cabdff1aSopenharmony_ci "stx.d %[tmp3], %[dst], %[stride_3] \n\t" 367cabdff1aSopenharmony_ci : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), 368cabdff1aSopenharmony_ci [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]), 369cabdff1aSopenharmony_ci [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3) 370cabdff1aSopenharmony_ci : [stride]"r"(stride), [dst]"r"(dst), [src]"r"(src) 371cabdff1aSopenharmony_ci : "memory" 372cabdff1aSopenharmony_ci ); 373cabdff1aSopenharmony_ci} 374cabdff1aSopenharmony_ci 375cabdff1aSopenharmony_cistatic void avc_chroma_hv_8w_lasx(uint8_t *src, 
uint8_t *dst, ptrdiff_t stride, 376cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 377cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1, 378cabdff1aSopenharmony_ci int32_t height) 379cabdff1aSopenharmony_ci{ 380cabdff1aSopenharmony_ci if (4 == height) { 381cabdff1aSopenharmony_ci avc_chroma_hv_8x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 382cabdff1aSopenharmony_ci coef_ver1); 383cabdff1aSopenharmony_ci } else if (8 == height) { 384cabdff1aSopenharmony_ci avc_chroma_hv_8x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 385cabdff1aSopenharmony_ci coef_ver1); 386cabdff1aSopenharmony_ci } 387cabdff1aSopenharmony_ci} 388cabdff1aSopenharmony_ci 389cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 390cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 391cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1) 392cabdff1aSopenharmony_ci{ 393cabdff1aSopenharmony_ci ptrdiff_t stride_2 = stride << 1; 394cabdff1aSopenharmony_ci __m256i src0, src1, src2; 395cabdff1aSopenharmony_ci __m256i res_hz, res_vt; 396cabdff1aSopenharmony_ci __m256i mask; 397cabdff1aSopenharmony_ci __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 398cabdff1aSopenharmony_ci __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 399cabdff1aSopenharmony_ci __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 400cabdff1aSopenharmony_ci __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 401cabdff1aSopenharmony_ci __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 402cabdff1aSopenharmony_ci __m256i coeff_vt_vec = __lasx_xvpermi_q(coeff_vt_vec1, coeff_vt_vec0, 0x02); 403cabdff1aSopenharmony_ci 404cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 405cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2); 406cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, 
src2, src1, mask, src0, src1); 407cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(src0, src1, 0x02); 408cabdff1aSopenharmony_ci res_hz = __lasx_xvdp2_h_bu(src0, coeff_hz_vec); 409cabdff1aSopenharmony_ci res_vt = __lasx_xvmul_h(res_hz, coeff_vt_vec); 410cabdff1aSopenharmony_ci res_hz = __lasx_xvpermi_q(res_hz, res_vt, 0x01); 411cabdff1aSopenharmony_ci res_vt = __lasx_xvadd_h(res_hz, res_vt); 412cabdff1aSopenharmony_ci res_vt = __lasx_xvssrarni_bu_h(res_vt, res_vt, 6); 413cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_vt, dst, 0, 0); 414cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_vt, dst + stride, 0, 1); 415cabdff1aSopenharmony_ci} 416cabdff1aSopenharmony_ci 417cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 418cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 419cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1) 420cabdff1aSopenharmony_ci{ 421cabdff1aSopenharmony_ci ptrdiff_t stride_2 = stride << 1; 422cabdff1aSopenharmony_ci ptrdiff_t stride_3 = stride_2 + stride; 423cabdff1aSopenharmony_ci ptrdiff_t stride_4 = stride_2 << 1; 424cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4; 425cabdff1aSopenharmony_ci __m256i res_hz0, res_hz1, res_vt0, res_vt1; 426cabdff1aSopenharmony_ci __m256i mask; 427cabdff1aSopenharmony_ci __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 428cabdff1aSopenharmony_ci __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 429cabdff1aSopenharmony_ci __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 430cabdff1aSopenharmony_ci __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 431cabdff1aSopenharmony_ci __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 432cabdff1aSopenharmony_ci 433cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 434cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 435cabdff1aSopenharmony_ci src, stride_4, src1, 
src2, src3, src4); 436cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask, 437cabdff1aSopenharmony_ci src4, src3, mask, src0, src1, src2, src3); 438cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src0, src1); 439cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1); 440cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); 441cabdff1aSopenharmony_ci res_hz0 = __lasx_xvadd_h(res_vt0, res_vt1); 442cabdff1aSopenharmony_ci res_hz0 = __lasx_xvssrarni_bu_h(res_hz0, res_hz0, 6); 443cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst, 0, 0); 444cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1); 445cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4); 446cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5); 447cabdff1aSopenharmony_ci} 448cabdff1aSopenharmony_ci 449cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x8_lasx(uint8_t *src, uint8_t * dst, ptrdiff_t stride, 450cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 451cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1) 452cabdff1aSopenharmony_ci{ 453cabdff1aSopenharmony_ci ptrdiff_t stride_2 = stride << 1; 454cabdff1aSopenharmony_ci ptrdiff_t stride_3 = stride_2 + stride; 455cabdff1aSopenharmony_ci ptrdiff_t stride_4 = stride_2 << 1; 456cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 457cabdff1aSopenharmony_ci __m256i res_hz0, res_hz1, res_hz2, res_hz3; 458cabdff1aSopenharmony_ci __m256i res_vt0, res_vt1, res_vt2, res_vt3; 459cabdff1aSopenharmony_ci __m256i mask; 460cabdff1aSopenharmony_ci __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 461cabdff1aSopenharmony_ci __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 462cabdff1aSopenharmony_ci __m256i coeff_hz_vec = 
__lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 463cabdff1aSopenharmony_ci __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 464cabdff1aSopenharmony_ci __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 465cabdff1aSopenharmony_ci 466cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 467cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 468cabdff1aSopenharmony_ci src, stride_4, src1, src2, src3, src4); 469cabdff1aSopenharmony_ci src += stride_4; 470cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 471cabdff1aSopenharmony_ci src, stride_4, src5, src6, src7, src8); 472cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask, 473cabdff1aSopenharmony_ci src4, src3, mask, src0, src1, src2, src3); 474cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvshuf_b, src5, src4, mask, src6, src5, mask, src7, src6, mask, 475cabdff1aSopenharmony_ci src8, src7, mask, src4, src5, src6, src7); 476cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src4, src6, 0x02, 477cabdff1aSopenharmony_ci src5, src7, 0x02, src0, src1, src4, src5); 478cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src4, coeff_hz_vec, 479cabdff1aSopenharmony_ci src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3); 480cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, 481cabdff1aSopenharmony_ci coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3); 482cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvadd_h, res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2); 483cabdff1aSopenharmony_ci res_hz0 = __lasx_xvssrarni_bu_h(res_vt2, res_vt0, 6); 484cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst, 0, 0); 485cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1); 
486cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4); 487cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5); 488cabdff1aSopenharmony_ci dst += stride_4; 489cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst, 0, 2); 490cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst + stride, 0, 3); 491cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 6); 492cabdff1aSopenharmony_ci __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 7); 493cabdff1aSopenharmony_ci} 494cabdff1aSopenharmony_ci 495cabdff1aSopenharmony_cistatic void avc_chroma_hv_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 496cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 497cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1, 498cabdff1aSopenharmony_ci int32_t height) 499cabdff1aSopenharmony_ci{ 500cabdff1aSopenharmony_ci if (8 == height) { 501cabdff1aSopenharmony_ci avc_chroma_hv_4x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 502cabdff1aSopenharmony_ci coef_ver1); 503cabdff1aSopenharmony_ci } else if (4 == height) { 504cabdff1aSopenharmony_ci avc_chroma_hv_4x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 505cabdff1aSopenharmony_ci coef_ver1); 506cabdff1aSopenharmony_ci } else if (2 == height) { 507cabdff1aSopenharmony_ci avc_chroma_hv_4x2_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 508cabdff1aSopenharmony_ci coef_ver1); 509cabdff1aSopenharmony_ci } 510cabdff1aSopenharmony_ci} 511cabdff1aSopenharmony_ci 512cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 513cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 514cabdff1aSopenharmony_ci{ 515cabdff1aSopenharmony_ci __m256i src0, src1; 516cabdff1aSopenharmony_ci __m256i res, mask; 517cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 518cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 519cabdff1aSopenharmony_ci __m256i coeff_vec 
= __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 520cabdff1aSopenharmony_ci 521cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 522cabdff1aSopenharmony_ci src1 = __lasx_xvldx(src, stride); 523cabdff1aSopenharmony_ci src0 = __lasx_xvshuf_b(src1, src0, mask); 524cabdff1aSopenharmony_ci res = __lasx_xvdp2_h_bu(src0, coeff_vec); 525cabdff1aSopenharmony_ci res = __lasx_xvslli_h(res, 3); 526cabdff1aSopenharmony_ci res = __lasx_xvssrarni_bu_h(res, res, 6); 527cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst, 0, 0); 528cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst + stride, 0, 1); 529cabdff1aSopenharmony_ci} 530cabdff1aSopenharmony_ci 531cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 532cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 533cabdff1aSopenharmony_ci{ 534cabdff1aSopenharmony_ci ptrdiff_t stride_2 = stride << 1; 535cabdff1aSopenharmony_ci ptrdiff_t stride_3 = stride_2 + stride; 536cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3; 537cabdff1aSopenharmony_ci __m256i res, mask; 538cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 539cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 540cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 541cabdff1aSopenharmony_ci 542cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 543cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2); 544cabdff1aSopenharmony_ci src3 = __lasx_xvldx(src, stride_3); 545cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src0, src2); 546cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(src0, src2, 0x02); 547cabdff1aSopenharmony_ci res = __lasx_xvdp2_h_bu(src0, coeff_vec); 548cabdff1aSopenharmony_ci res = __lasx_xvslli_h(res, 3); 549cabdff1aSopenharmony_ci res = __lasx_xvssrarni_bu_h(res, res, 6); 
550cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst, 0, 0); 551cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst + stride, 0, 1); 552cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst + stride_2, 0, 4); 553cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst + stride_3, 0, 5); 554cabdff1aSopenharmony_ci} 555cabdff1aSopenharmony_ci 556cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 557cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 558cabdff1aSopenharmony_ci{ 559cabdff1aSopenharmony_ci ptrdiff_t stride_2 = stride << 1; 560cabdff1aSopenharmony_ci ptrdiff_t stride_3 = stride_2 + stride; 561cabdff1aSopenharmony_ci ptrdiff_t stride_4 = stride_2 << 1; 562cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7; 563cabdff1aSopenharmony_ci __m256i res0, res1, mask; 564cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 565cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 566cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 567cabdff1aSopenharmony_ci 568cabdff1aSopenharmony_ci coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 569cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); 570cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 571cabdff1aSopenharmony_ci src, stride_4, src1, src2, src3, src4); 572cabdff1aSopenharmony_ci src += stride_4; 573cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src5, src6); 574cabdff1aSopenharmony_ci src7 = __lasx_xvldx(src, stride_3); 575cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask, 576cabdff1aSopenharmony_ci src7, src6, mask, src0, src2, src4, src6); 577cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src4, src6, 0x02, src0, src4); 578cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvdp2_h_bu, src0, 
coeff_vec, src4, coeff_vec, res0, res1); 579cabdff1aSopenharmony_ci res0 = __lasx_xvssrarni_bu_h(res1, res0, 6); 580cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst, 0, 0); 581cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride, 0, 1); 582cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride_2, 0, 4); 583cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride_3, 0, 5); 584cabdff1aSopenharmony_ci dst += stride_4; 585cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst, 0, 2); 586cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride, 0, 3); 587cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride_2, 0, 6); 588cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride_3, 0, 7); 589cabdff1aSopenharmony_ci} 590cabdff1aSopenharmony_ci 591cabdff1aSopenharmony_cistatic void avc_chroma_hz_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 592cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1, 593cabdff1aSopenharmony_ci int32_t height) 594cabdff1aSopenharmony_ci{ 595cabdff1aSopenharmony_ci if (8 == height) { 596cabdff1aSopenharmony_ci avc_chroma_hz_4x8_lasx(src, dst, stride, coeff0, coeff1); 597cabdff1aSopenharmony_ci } else if (4 == height) { 598cabdff1aSopenharmony_ci avc_chroma_hz_4x4_lasx(src, dst, stride, coeff0, coeff1); 599cabdff1aSopenharmony_ci } else if (2 == height) { 600cabdff1aSopenharmony_ci avc_chroma_hz_4x2_lasx(src, dst, stride, coeff0, coeff1); 601cabdff1aSopenharmony_ci } 602cabdff1aSopenharmony_ci} 603cabdff1aSopenharmony_ci 604cabdff1aSopenharmony_cistatic void avc_chroma_hz_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 605cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1, 606cabdff1aSopenharmony_ci int32_t height) 607cabdff1aSopenharmony_ci{ 608cabdff1aSopenharmony_ci if (4 == height) { 609cabdff1aSopenharmony_ci avc_chroma_hz_8x4_lasx(src, dst, stride, coeff0, coeff1); 610cabdff1aSopenharmony_ci } else if (8 == height) { 611cabdff1aSopenharmony_ci avc_chroma_hz_8x8_lasx(src, dst, stride, coeff0, 
coeff1); 612cabdff1aSopenharmony_ci } else { 613cabdff1aSopenharmony_ci avc_chroma_hz_nonmult_lasx(src, dst, stride, coeff0, coeff1, height); 614cabdff1aSopenharmony_ci } 615cabdff1aSopenharmony_ci} 616cabdff1aSopenharmony_ci 617cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 618cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 619cabdff1aSopenharmony_ci{ 620cabdff1aSopenharmony_ci __m256i src0, src1, src2; 621cabdff1aSopenharmony_ci __m256i tmp0, tmp1; 622cabdff1aSopenharmony_ci __m256i res; 623cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 624cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 625cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 626cabdff1aSopenharmony_ci 627cabdff1aSopenharmony_ci src0 = __lasx_xvld(src, 0); 628cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src, stride, src, stride << 1, src1, src2); 629cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, tmp0, tmp1); 630cabdff1aSopenharmony_ci tmp0 = __lasx_xvilvl_d(tmp1, tmp0); 631cabdff1aSopenharmony_ci res = __lasx_xvdp2_h_bu(tmp0, coeff_vec); 632cabdff1aSopenharmony_ci res = __lasx_xvslli_h(res, 3); 633cabdff1aSopenharmony_ci res = __lasx_xvssrarni_bu_h(res, res, 6); 634cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst, 0, 0); 635cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst + stride, 0, 1); 636cabdff1aSopenharmony_ci} 637cabdff1aSopenharmony_ci 638cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 639cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 640cabdff1aSopenharmony_ci{ 641cabdff1aSopenharmony_ci ptrdiff_t stride_2 = stride << 1; 642cabdff1aSopenharmony_ci ptrdiff_t stride_3 = stride_2 + stride; 643cabdff1aSopenharmony_ci ptrdiff_t stride_4 = stride_2 << 1; 644cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4; 645cabdff1aSopenharmony_ci 
__m256i tmp0, tmp1, tmp2, tmp3; 646cabdff1aSopenharmony_ci __m256i res; 647cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 648cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 649cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 650cabdff1aSopenharmony_ci 651cabdff1aSopenharmony_ci src0 = __lasx_xvld(src, 0); 652cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 653cabdff1aSopenharmony_ci src, stride_4, src1, src2, src3, src4); 654cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, 655cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 656cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); 657cabdff1aSopenharmony_ci tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02); 658cabdff1aSopenharmony_ci res = __lasx_xvdp2_h_bu(tmp0, coeff_vec); 659cabdff1aSopenharmony_ci res = __lasx_xvslli_h(res, 3); 660cabdff1aSopenharmony_ci res = __lasx_xvssrarni_bu_h(res, res, 6); 661cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst, 0, 0); 662cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst + stride, 0, 1); 663cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst + stride_2, 0, 4); 664cabdff1aSopenharmony_ci __lasx_xvstelm_w(res, dst + stride_3, 0, 5); 665cabdff1aSopenharmony_ci} 666cabdff1aSopenharmony_ci 667cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 668cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 669cabdff1aSopenharmony_ci{ 670cabdff1aSopenharmony_ci ptrdiff_t stride_2 = stride << 1; 671cabdff1aSopenharmony_ci ptrdiff_t stride_3 = stride_2 + stride; 672cabdff1aSopenharmony_ci ptrdiff_t stride_4 = stride_2 << 1; 673cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 674cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 675cabdff1aSopenharmony_ci __m256i res0, 
res1; 676cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 677cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 678cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 679cabdff1aSopenharmony_ci 680cabdff1aSopenharmony_ci coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 681cabdff1aSopenharmony_ci src0 = __lasx_xvld(src, 0); 682cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 683cabdff1aSopenharmony_ci src, stride_4, src1, src2, src3, src4); 684cabdff1aSopenharmony_ci src += stride_4; 685cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, 686cabdff1aSopenharmony_ci src, stride_4, src5, src6, src7, src8); 687cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, 688cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 689cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, 690cabdff1aSopenharmony_ci tmp4, tmp5, tmp6, tmp7); 691cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, 692cabdff1aSopenharmony_ci tmp0, tmp2, tmp4, tmp6); 693cabdff1aSopenharmony_ci tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02); 694cabdff1aSopenharmony_ci tmp4 = __lasx_xvpermi_q(tmp4, tmp6, 0x02); 695cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, coeff_vec, tmp4, coeff_vec, res0, res1); 696cabdff1aSopenharmony_ci res0 = __lasx_xvssrarni_bu_h(res1, res0, 6); 697cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst, 0, 0); 698cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride, 0, 1); 699cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride_2, 0, 4); 700cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride_3, 0, 5); 701cabdff1aSopenharmony_ci dst += stride_4; 702cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst, 0, 2); 703cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride, 0, 
3); 704cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride_2, 0, 6); 705cabdff1aSopenharmony_ci __lasx_xvstelm_w(res0, dst + stride_3, 0, 7); 706cabdff1aSopenharmony_ci} 707cabdff1aSopenharmony_ci 708cabdff1aSopenharmony_cistatic void avc_chroma_vt_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 709cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1, 710cabdff1aSopenharmony_ci int32_t height) 711cabdff1aSopenharmony_ci{ 712cabdff1aSopenharmony_ci if (8 == height) { 713cabdff1aSopenharmony_ci avc_chroma_vt_4x8_lasx(src, dst, stride, coeff0, coeff1); 714cabdff1aSopenharmony_ci } else if (4 == height) { 715cabdff1aSopenharmony_ci avc_chroma_vt_4x4_lasx(src, dst, stride, coeff0, coeff1); 716cabdff1aSopenharmony_ci } else if (2 == height) { 717cabdff1aSopenharmony_ci avc_chroma_vt_4x2_lasx(src, dst, stride, coeff0, coeff1); 718cabdff1aSopenharmony_ci } 719cabdff1aSopenharmony_ci} 720cabdff1aSopenharmony_ci 721cabdff1aSopenharmony_cistatic void avc_chroma_vt_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 722cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1, 723cabdff1aSopenharmony_ci int32_t height) 724cabdff1aSopenharmony_ci{ 725cabdff1aSopenharmony_ci if (4 == height) { 726cabdff1aSopenharmony_ci avc_chroma_vt_8x4_lasx(src, dst, stride, coeff0, coeff1); 727cabdff1aSopenharmony_ci } else if (8 == height) { 728cabdff1aSopenharmony_ci avc_chroma_vt_8x8_lasx(src, dst, stride, coeff0, coeff1); 729cabdff1aSopenharmony_ci } 730cabdff1aSopenharmony_ci} 731cabdff1aSopenharmony_ci 732cabdff1aSopenharmony_cistatic void copy_width4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 733cabdff1aSopenharmony_ci int32_t height) 734cabdff1aSopenharmony_ci{ 735cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; 736cabdff1aSopenharmony_ci 737cabdff1aSopenharmony_ci if (8 == height) { 738cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3, stride_4; 739cabdff1aSopenharmony_ci 740cabdff1aSopenharmony_ci __asm__ volatile ( 
741cabdff1aSopenharmony_ci "slli.d %[stride_2], %[stride], 1 \n\t" 742cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[stride] \n\t" 743cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 744cabdff1aSopenharmony_ci "ld.wu %[tp0], %[src], 0 \n\t" 745cabdff1aSopenharmony_ci "ldx.wu %[tp1], %[src], %[stride] \n\t" 746cabdff1aSopenharmony_ci "ldx.wu %[tp2], %[src], %[stride_2] \n\t" 747cabdff1aSopenharmony_ci "ldx.wu %[tp3], %[src], %[stride_3] \n\t" 748cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 749cabdff1aSopenharmony_ci "ld.wu %[tp4], %[src], 0 \n\t" 750cabdff1aSopenharmony_ci "ldx.wu %[tp5], %[src], %[stride] \n\t" 751cabdff1aSopenharmony_ci "ldx.wu %[tp6], %[src], %[stride_2] \n\t" 752cabdff1aSopenharmony_ci "ldx.wu %[tp7], %[src], %[stride_3] \n\t" 753cabdff1aSopenharmony_ci "st.w %[tp0], %[dst], 0 \n\t" 754cabdff1aSopenharmony_ci "stx.w %[tp1], %[dst], %[stride] \n\t" 755cabdff1aSopenharmony_ci "stx.w %[tp2], %[dst], %[stride_2] \n\t" 756cabdff1aSopenharmony_ci "stx.w %[tp3], %[dst], %[stride_3] \n\t" 757cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride_4] \n\t" 758cabdff1aSopenharmony_ci "st.w %[tp4], %[dst], 0 \n\t" 759cabdff1aSopenharmony_ci "stx.w %[tp5], %[dst], %[stride] \n\t" 760cabdff1aSopenharmony_ci "stx.w %[tp6], %[dst], %[stride_2] \n\t" 761cabdff1aSopenharmony_ci "stx.w %[tp7], %[dst], %[stride_3] \n\t" 762cabdff1aSopenharmony_ci : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3), [stride_4]"+&r"(stride_4), 763cabdff1aSopenharmony_ci [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1), 764cabdff1aSopenharmony_ci [tp2]"+&r"(tp2), [tp3]"+&r"(tp3), [tp4]"+&r"(tp4), [tp5]"+&r"(tp5), 765cabdff1aSopenharmony_ci [tp6]"+&r"(tp6), [tp7]"+&r"(tp7) 766cabdff1aSopenharmony_ci : [stride]"r"(stride) 767cabdff1aSopenharmony_ci : "memory" 768cabdff1aSopenharmony_ci ); 769cabdff1aSopenharmony_ci } else if (4 == height) { 770cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3; 
771cabdff1aSopenharmony_ci 772cabdff1aSopenharmony_ci __asm__ volatile ( 773cabdff1aSopenharmony_ci "slli.d %[stride_2], %[stride], 1 \n\t" 774cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[stride] \n\t" 775cabdff1aSopenharmony_ci "ld.wu %[tp0], %[src], 0 \n\t" 776cabdff1aSopenharmony_ci "ldx.wu %[tp1], %[src], %[stride] \n\t" 777cabdff1aSopenharmony_ci "ldx.wu %[tp2], %[src], %[stride_2] \n\t" 778cabdff1aSopenharmony_ci "ldx.wu %[tp3], %[src], %[stride_3] \n\t" 779cabdff1aSopenharmony_ci "st.w %[tp0], %[dst], 0 \n\t" 780cabdff1aSopenharmony_ci "stx.w %[tp1], %[dst], %[stride] \n\t" 781cabdff1aSopenharmony_ci "stx.w %[tp2], %[dst], %[stride_2] \n\t" 782cabdff1aSopenharmony_ci "stx.w %[tp3], %[dst], %[stride_3] \n\t" 783cabdff1aSopenharmony_ci : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3), 784cabdff1aSopenharmony_ci [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1), 785cabdff1aSopenharmony_ci [tp2]"+&r"(tp2), [tp3]"+&r"(tp3) 786cabdff1aSopenharmony_ci : [stride]"r"(stride) 787cabdff1aSopenharmony_ci : "memory" 788cabdff1aSopenharmony_ci ); 789cabdff1aSopenharmony_ci } else if (2 == height) { 790cabdff1aSopenharmony_ci __asm__ volatile ( 791cabdff1aSopenharmony_ci "ld.wu %[tp0], %[src], 0 \n\t" 792cabdff1aSopenharmony_ci "ldx.wu %[tp1], %[src], %[stride] \n\t" 793cabdff1aSopenharmony_ci "st.w %[tp0], %[dst], 0 \n\t" 794cabdff1aSopenharmony_ci "stx.w %[tp1], %[dst], %[stride] \n\t" 795cabdff1aSopenharmony_ci : [tp0]"+&r"(tp0), [tp1]"+&r"(tp1) 796cabdff1aSopenharmony_ci : [src]"r"(src), [dst]"r"(dst), [stride]"r"(stride) 797cabdff1aSopenharmony_ci : "memory" 798cabdff1aSopenharmony_ci ); 799cabdff1aSopenharmony_ci } 800cabdff1aSopenharmony_ci} 801cabdff1aSopenharmony_ci 802cabdff1aSopenharmony_cistatic void copy_width8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 803cabdff1aSopenharmony_ci int32_t height) 804cabdff1aSopenharmony_ci{ 805cabdff1aSopenharmony_ci if (8 == height) { 806cabdff1aSopenharmony_ci 
copy_width8x8_lasx(src, dst, stride); 807cabdff1aSopenharmony_ci } else if (4 == height) { 808cabdff1aSopenharmony_ci copy_width8x4_lasx(src, dst, stride); 809cabdff1aSopenharmony_ci } 810cabdff1aSopenharmony_ci} 811cabdff1aSopenharmony_ci 812cabdff1aSopenharmony_civoid ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, 813cabdff1aSopenharmony_ci int height, int x, int y) 814cabdff1aSopenharmony_ci{ 815cabdff1aSopenharmony_ci av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); 816cabdff1aSopenharmony_ci 817cabdff1aSopenharmony_ci if(x && y) { 818cabdff1aSopenharmony_ci avc_chroma_hv_4w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height); 819cabdff1aSopenharmony_ci } else if (x) { 820cabdff1aSopenharmony_ci avc_chroma_hz_4w_lasx(src, dst, stride, x, (8 - x), height); 821cabdff1aSopenharmony_ci } else if (y) { 822cabdff1aSopenharmony_ci avc_chroma_vt_4w_lasx(src, dst, stride, y, (8 - y), height); 823cabdff1aSopenharmony_ci } else { 824cabdff1aSopenharmony_ci copy_width4_lasx(src, dst, stride, height); 825cabdff1aSopenharmony_ci } 826cabdff1aSopenharmony_ci} 827cabdff1aSopenharmony_ci 828cabdff1aSopenharmony_civoid ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, 829cabdff1aSopenharmony_ci int height, int x, int y) 830cabdff1aSopenharmony_ci{ 831cabdff1aSopenharmony_ci av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); 832cabdff1aSopenharmony_ci 833cabdff1aSopenharmony_ci if (!(x || y)) { 834cabdff1aSopenharmony_ci copy_width8_lasx(src, dst, stride, height); 835cabdff1aSopenharmony_ci } else if (x && y) { 836cabdff1aSopenharmony_ci avc_chroma_hv_8w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height); 837cabdff1aSopenharmony_ci } else if (x) { 838cabdff1aSopenharmony_ci avc_chroma_hz_8w_lasx(src, dst, stride, x, (8 - x), height); 839cabdff1aSopenharmony_ci } else { 840cabdff1aSopenharmony_ci avc_chroma_vt_8w_lasx(src, dst, stride, y, (8 - y), height); 841cabdff1aSopenharmony_ci } 842cabdff1aSopenharmony_ci} 
843cabdff1aSopenharmony_ci 844cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hv_and_aver_dst_8x4_lasx(uint8_t *src, 845cabdff1aSopenharmony_ci uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0, 846cabdff1aSopenharmony_ci uint32_t coef_hor1, uint32_t coef_ver0, 847cabdff1aSopenharmony_ci uint32_t coef_ver1) 848cabdff1aSopenharmony_ci{ 849cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 850cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 851cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 852cabdff1aSopenharmony_ci __m256i tp0, tp1, tp2, tp3; 853cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, out; 854cabdff1aSopenharmony_ci __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1; 855cabdff1aSopenharmony_ci __m256i mask; 856cabdff1aSopenharmony_ci __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 857cabdff1aSopenharmony_ci __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 858cabdff1aSopenharmony_ci __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 859cabdff1aSopenharmony_ci __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 860cabdff1aSopenharmony_ci __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 861cabdff1aSopenharmony_ci 862cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 863cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 864cabdff1aSopenharmony_ci src1, src2, src3, src4); 865cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3); 866cabdff1aSopenharmony_ci src0 = __lasx_xvshuf_b(src0, src0, mask); 867cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3); 868cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1); 869cabdff1aSopenharmony_ci res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec); 
870cabdff1aSopenharmony_ci res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0); 871cabdff1aSopenharmony_ci res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0); 872cabdff1aSopenharmony_ci res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20); 873cabdff1aSopenharmony_ci res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3); 874cabdff1aSopenharmony_ci res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1); 875cabdff1aSopenharmony_ci res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1); 876cabdff1aSopenharmony_ci out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6); 877cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 878cabdff1aSopenharmony_ci tp0, tp1, tp2, tp3); 879cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 880cabdff1aSopenharmony_ci tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 881cabdff1aSopenharmony_ci out = __lasx_xvavgr_bu(out, tp0); 882cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst, 0, 0); 883cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride, 0, 2); 884cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 885cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 886cabdff1aSopenharmony_ci} 887cabdff1aSopenharmony_ci 888cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hv_and_aver_dst_8x8_lasx(uint8_t *src, 889cabdff1aSopenharmony_ci uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0, 890cabdff1aSopenharmony_ci uint32_t coef_hor1, uint32_t coef_ver0, 891cabdff1aSopenharmony_ci uint32_t coef_ver1) 892cabdff1aSopenharmony_ci{ 893cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 894cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 895cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 896cabdff1aSopenharmony_ci __m256i tp0, tp1, tp2, tp3, dst0, dst1; 897cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 898cabdff1aSopenharmony_ci __m256i out0, out1; 
899cabdff1aSopenharmony_ci __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4; 900cabdff1aSopenharmony_ci __m256i res_vt0, res_vt1, res_vt2, res_vt3; 901cabdff1aSopenharmony_ci __m256i mask; 902cabdff1aSopenharmony_ci __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0); 903cabdff1aSopenharmony_ci __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1); 904cabdff1aSopenharmony_ci __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0); 905cabdff1aSopenharmony_ci __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); 906cabdff1aSopenharmony_ci __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1); 907cabdff1aSopenharmony_ci 908cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); 909cabdff1aSopenharmony_ci src += stride; 910cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 911cabdff1aSopenharmony_ci src1, src2, src3, src4); 912cabdff1aSopenharmony_ci src += stride_4x; 913cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 914cabdff1aSopenharmony_ci src5, src6, src7, src8); 915cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20, 916cabdff1aSopenharmony_ci src8, src7, 0x20, src1, src3, src5, src7); 917cabdff1aSopenharmony_ci src0 = __lasx_xvshuf_b(src0, src0, mask); 918cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7, 919cabdff1aSopenharmony_ci src7, mask, src1, src3, src5, src7); 920cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3, 921cabdff1aSopenharmony_ci coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3); 922cabdff1aSopenharmony_ci res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec); 923cabdff1aSopenharmony_ci res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0); 924cabdff1aSopenharmony_ci res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0); 
925cabdff1aSopenharmony_ci res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0); 926cabdff1aSopenharmony_ci res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0); 927cabdff1aSopenharmony_ci res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20); 928cabdff1aSopenharmony_ci res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3); 929cabdff1aSopenharmony_ci res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3); 930cabdff1aSopenharmony_ci res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3); 931cabdff1aSopenharmony_ci res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1); 932cabdff1aSopenharmony_ci res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1); 933cabdff1aSopenharmony_ci res_vt2 = __lasx_xvmadd_h(res_vt2, res_hz2, coeff_vt_vec1); 934cabdff1aSopenharmony_ci res_vt3 = __lasx_xvmadd_h(res_vt3, res_hz3, coeff_vt_vec1); 935cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6, 936cabdff1aSopenharmony_ci out0, out1); 937cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 938cabdff1aSopenharmony_ci tp0, tp1, tp2, tp3); 939cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 940cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 941cabdff1aSopenharmony_ci dst += stride_4x; 942cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 943cabdff1aSopenharmony_ci tp0, tp1, tp2, tp3); 944cabdff1aSopenharmony_ci dst -= stride_4x; 945cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 946cabdff1aSopenharmony_ci dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20); 947cabdff1aSopenharmony_ci out0 = __lasx_xvavgr_bu(out0, dst0); 948cabdff1aSopenharmony_ci out1 = __lasx_xvavgr_bu(out1, dst1); 949cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst, 0, 0); 950cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride, 0, 2); 951cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 
952cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 953cabdff1aSopenharmony_ci dst += stride_4x; 954cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst, 0, 0); 955cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride, 0, 2); 956cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 957cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 958cabdff1aSopenharmony_ci} 959cabdff1aSopenharmony_ci /* Horizontal chroma interpolation of an 8x4 block, rounding-averaged into dst. Adjacent pixel pairs (selected via chroma_mask_arr) are weighted by coeff0/coeff1; both weights are scaled by 8, so with the x and 8 - x that ff_avg_h264_chroma_mc8_lasx passes they sum to 64 and the rounding shift by 6 normalises the sum. NOTE(review): rows are fetched with full-width vector loads, which appear to read past the 8 (src: 9) bytes actually used - confirm callers provide readable padding. */ 960cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hz_and_aver_dst_8x4_lasx(uint8_t *src, 961cabdff1aSopenharmony_ci uint8_t *dst, ptrdiff_t stride, uint32_t coeff0, 962cabdff1aSopenharmony_ci uint32_t coeff1) 963cabdff1aSopenharmony_ci{ 964cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 965cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 966cabdff1aSopenharmony_ci __m256i tp0, tp1, tp2, tp3; 967cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, out; 968cabdff1aSopenharmony_ci __m256i res0, res1; 969cabdff1aSopenharmony_ci __m256i mask; 970cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 971cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 972cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 973cabdff1aSopenharmony_ci 974cabdff1aSopenharmony_ci /* pre-scale both weights by 8 */ coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 975cabdff1aSopenharmony_ci mask = __lasx_xvld(chroma_mask_arr, 0); 976cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 977cabdff1aSopenharmony_ci src0, src1, src2, src3); 978cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2); 979cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2); 980cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); 981cabdff1aSopenharmony_ci out = __lasx_xvssrarni_bu_h(res1, res0, 6);
982cabdff1aSopenharmony_ci /* load the four current dst rows and rounding-average them with the filtered result */ DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 983cabdff1aSopenharmony_ci tp0, tp1, tp2, tp3); 984cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 985cabdff1aSopenharmony_ci tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 986cabdff1aSopenharmony_ci out = __lasx_xvavgr_bu(out, tp0); 987cabdff1aSopenharmony_ci /* dst rows 0..3 come from 64-bit elements 0, 2, 1, 3 */ __lasx_xvstelm_d(out, dst, 0, 0); 988cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride, 0, 2); 989cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 990cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 991cabdff1aSopenharmony_ci} 992cabdff1aSopenharmony_ci /* 8x8 counterpart of avc_chroma_hz_and_aver_dst_8x4_lasx: the same horizontal filter and dst average applied to eight rows, handled as two groups of four (out0/dst0 and out1/dst1). dst is stepped forward by four rows to load the second group, stepped back, and stepped forward again before the second group of stores. */ 993cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hz_and_aver_dst_8x8_lasx(uint8_t *src, 994cabdff1aSopenharmony_ci uint8_t *dst, ptrdiff_t stride, uint32_t coeff0, 995cabdff1aSopenharmony_ci uint32_t coeff1) 996cabdff1aSopenharmony_ci{ 997cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 998cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 999cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1000cabdff1aSopenharmony_ci __m256i tp0, tp1, tp2, tp3, dst0, dst1; 1001cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7; 1002cabdff1aSopenharmony_ci __m256i out0, out1; 1003cabdff1aSopenharmony_ci __m256i res0, res1, res2, res3; 1004cabdff1aSopenharmony_ci __m256i mask; 1005cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 1006cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 1007cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 1008cabdff1aSopenharmony_ci 1009cabdff1aSopenharmony_ci coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 1010cabdff1aSopenharmony_ci mask = __lasx_xvld(chroma_mask_arr, 0); 1011cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 1012cabdff1aSopenharmony_ci src0,
src1, src2, src3); 1013cabdff1aSopenharmony_ci src += stride_4x; 1014cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 1015cabdff1aSopenharmony_ci src4, src5, src6, src7); 1016cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20, 1017cabdff1aSopenharmony_ci src7, src6, 0x20, src0, src2, src4, src6); 1018cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4, 1019cabdff1aSopenharmony_ci mask, src6, src6, mask, src0, src2, src4, src6); 1020cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6, 1021cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 1022cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1); 1023cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 1024cabdff1aSopenharmony_ci tp0, tp1, tp2, tp3); 1025cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 1026cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 1027cabdff1aSopenharmony_ci dst += stride_4x; 1028cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 1029cabdff1aSopenharmony_ci tp0, tp1, tp2, tp3); 1030cabdff1aSopenharmony_ci dst -= stride_4x; 1031cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 1032cabdff1aSopenharmony_ci dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20); 1033cabdff1aSopenharmony_ci out0 = __lasx_xvavgr_bu(out0, dst0); 1034cabdff1aSopenharmony_ci out1 = __lasx_xvavgr_bu(out1, dst1); 1035cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst, 0, 0); 1036cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride, 0, 2); 1037cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 1038cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
1039cabdff1aSopenharmony_ci dst += stride_4x; 1040cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst, 0, 0); 1041cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride, 0, 2); 1042cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 1043cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 1044cabdff1aSopenharmony_ci} 1045cabdff1aSopenharmony_ci /* Vertical chroma interpolation of an 8x4 block, rounding-averaged into dst. Five source rows (offsets 0..4*stride) are loaded and each consecutive pair of rows is byte-interleaved and weighted by coeff0/coeff1, which are scaled by 8 and normalised by the rounding shift by 6 exactly as in the horizontal kernels. */ 1046cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_vt_and_aver_dst_8x4_lasx(uint8_t *src, 1047cabdff1aSopenharmony_ci uint8_t *dst, ptrdiff_t stride, uint32_t coeff0, 1048cabdff1aSopenharmony_ci uint32_t coeff1) 1049cabdff1aSopenharmony_ci{ 1050cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1051cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1052cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1053cabdff1aSopenharmony_ci __m256i tp0, tp1, tp2, tp3; 1054cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, out; 1055cabdff1aSopenharmony_ci __m256i res0, res1; 1056cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 1057cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 1058cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 1059cabdff1aSopenharmony_ci 1060cabdff1aSopenharmony_ci coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 1061cabdff1aSopenharmony_ci src0 = __lasx_xvld(src, 0); 1062cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, 1063cabdff1aSopenharmony_ci src1, src2, src3, src4); 1064cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, 1065cabdff1aSopenharmony_ci src4, src3, 0x20, src0, src1, src2, src3); 1066cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2); 1067cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); 1068cabdff1aSopenharmony_ci out =
__lasx_xvssrarni_bu_h(res1, res0, 6); 1069cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 1070cabdff1aSopenharmony_ci tp0, tp1, tp2, tp3); 1071cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 1072cabdff1aSopenharmony_ci tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 1073cabdff1aSopenharmony_ci out = __lasx_xvavgr_bu(out, tp0); 1074cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst, 0, 0); 1075cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride, 0, 2); 1076cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_2x, 0, 1); 1077cabdff1aSopenharmony_ci __lasx_xvstelm_d(out, dst + stride_3x, 0, 3); 1078cabdff1aSopenharmony_ci} 1079cabdff1aSopenharmony_ci /* 8x8 counterpart of avc_chroma_vt_and_aver_dst_8x4_lasx: nine source rows are loaded (src is advanced between the loads) to produce eight output rows, which are averaged with dst and stored as two groups of four. */ 1080cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_vt_and_aver_dst_8x8_lasx(uint8_t *src, 1081cabdff1aSopenharmony_ci uint8_t *dst, ptrdiff_t stride, uint32_t coeff0, 1082cabdff1aSopenharmony_ci uint32_t coeff1) 1083cabdff1aSopenharmony_ci{ 1084cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1085cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1086cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1087cabdff1aSopenharmony_ci __m256i tp0, tp1, tp2, tp3, dst0, dst1; 1088cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 1089cabdff1aSopenharmony_ci __m256i out0, out1; 1090cabdff1aSopenharmony_ci __m256i res0, res1, res2, res3; 1091cabdff1aSopenharmony_ci __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0); 1092cabdff1aSopenharmony_ci __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1); 1093cabdff1aSopenharmony_ci __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); 1094cabdff1aSopenharmony_ci 1095cabdff1aSopenharmony_ci coeff_vec = __lasx_xvslli_b(coeff_vec, 3); 1096cabdff1aSopenharmony_ci src0 = __lasx_xvld(src, 0); 1097cabdff1aSopenharmony_ci src += stride; 1098cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
src, stride_3x, 1099cabdff1aSopenharmony_ci src1, src2, src3, src4); 1100cabdff1aSopenharmony_ci src += stride_4x; 1101cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, 1102cabdff1aSopenharmony_ci src5, src6, src7, src8); 1103cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, 1104cabdff1aSopenharmony_ci src4, src3, 0x20, src0, src1, src2, src3); 1105cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20, 1106cabdff1aSopenharmony_ci src8, src7, 0x20, src4, src5, src6, src7); 1107cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6, 1108cabdff1aSopenharmony_ci src0, src2, src4, src6); 1109cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6, 1110cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 1111cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1); 1112cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 1113cabdff1aSopenharmony_ci tp0, tp1, tp2, tp3); 1114cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 1115cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20); 1116cabdff1aSopenharmony_ci dst += stride_4x; 1117cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, 1118cabdff1aSopenharmony_ci tp0, tp1, tp2, tp3); 1119cabdff1aSopenharmony_ci dst -= stride_4x; 1120cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); 1121cabdff1aSopenharmony_ci dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20); 1122cabdff1aSopenharmony_ci out0 = __lasx_xvavgr_bu(out0, dst0); 1123cabdff1aSopenharmony_ci out1 = __lasx_xvavgr_bu(out1, dst1); 1124cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst, 0, 0);
1125cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride, 0, 2); 1126cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1); 1127cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3); 1128cabdff1aSopenharmony_ci dst += stride_4x; 1129cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst, 0, 0); 1130cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride, 0, 2); 1131cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1); 1132cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3); 1133cabdff1aSopenharmony_ci} 1134cabdff1aSopenharmony_ci /* Rounding-average of an 8x8 block of src into dst without any filtering. Works in two passes of four rows: each row is loaded as one 64-bit element, the four rows are packed into a single vector, averaged, and stored back from 64-bit elements 0..3 in row order. */ 1135cabdff1aSopenharmony_cistatic av_always_inline void avg_width8x8_lasx(uint8_t *src, uint8_t *dst, 1136cabdff1aSopenharmony_ci ptrdiff_t stride) 1137cabdff1aSopenharmony_ci{ 1138cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3; 1139cabdff1aSopenharmony_ci __m256i dst0, dst1, dst2, dst3; 1140cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1141cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1142cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1143cabdff1aSopenharmony_ci 1144cabdff1aSopenharmony_ci src0 = __lasx_xvldrepl_d(src, 0); 1145cabdff1aSopenharmony_ci src1 = __lasx_xvldrepl_d(src + stride, 0); 1146cabdff1aSopenharmony_ci src2 = __lasx_xvldrepl_d(src + stride_2x, 0); 1147cabdff1aSopenharmony_ci src3 = __lasx_xvldrepl_d(src + stride_3x, 0); 1148cabdff1aSopenharmony_ci dst0 = __lasx_xvldrepl_d(dst, 0); 1149cabdff1aSopenharmony_ci dst1 = __lasx_xvldrepl_d(dst + stride, 0); 1150cabdff1aSopenharmony_ci dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0); 1151cabdff1aSopenharmony_ci dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0); 1152cabdff1aSopenharmony_ci src0 = __lasx_xvpackev_d(src1,src0); 1153cabdff1aSopenharmony_ci src2 = __lasx_xvpackev_d(src3,src2); 1154cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(src0, src2, 0x02); 1155cabdff1aSopenharmony_ci dst0 = __lasx_xvpackev_d(dst1,dst0); 1156cabdff1aSopenharmony_ci dst2 =
__lasx_xvpackev_d(dst3,dst2); 1157cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02); 1158cabdff1aSopenharmony_ci dst0 = __lasx_xvavgr_bu(src0, dst0); 1159cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 0); 1160cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride, 0, 1); 1161cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2); 1162cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3); 1163cabdff1aSopenharmony_ci 1164cabdff1aSopenharmony_ci src += stride_4x; 1165cabdff1aSopenharmony_ci dst += stride_4x; 1166cabdff1aSopenharmony_ci src0 = __lasx_xvldrepl_d(src, 0); 1167cabdff1aSopenharmony_ci src1 = __lasx_xvldrepl_d(src + stride, 0); 1168cabdff1aSopenharmony_ci src2 = __lasx_xvldrepl_d(src + stride_2x, 0); 1169cabdff1aSopenharmony_ci src3 = __lasx_xvldrepl_d(src + stride_3x, 0); 1170cabdff1aSopenharmony_ci dst0 = __lasx_xvldrepl_d(dst, 0); 1171cabdff1aSopenharmony_ci dst1 = __lasx_xvldrepl_d(dst + stride, 0); 1172cabdff1aSopenharmony_ci dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0); 1173cabdff1aSopenharmony_ci dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0); 1174cabdff1aSopenharmony_ci src0 = __lasx_xvpackev_d(src1,src0); 1175cabdff1aSopenharmony_ci src2 = __lasx_xvpackev_d(src3,src2); 1176cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(src0, src2, 0x02); 1177cabdff1aSopenharmony_ci dst0 = __lasx_xvpackev_d(dst1,dst0); 1178cabdff1aSopenharmony_ci dst2 = __lasx_xvpackev_d(dst3,dst2); 1179cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02); 1180cabdff1aSopenharmony_ci dst0 = __lasx_xvavgr_bu(src0, dst0); 1181cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 0); 1182cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride, 0, 1); 1183cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2); 1184cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3); 1185cabdff1aSopenharmony_ci} 1186cabdff1aSopenharmony_ci /* Rounding-average of an 8x4 block of src into dst without any filtering; a single four-row pass of the scheme used by avg_width8x8_lasx. */ 1187cabdff1aSopenharmony_cistatic av_always_inline void
avg_width8x4_lasx(uint8_t *src, uint8_t *dst, 1188cabdff1aSopenharmony_ci ptrdiff_t stride) 1189cabdff1aSopenharmony_ci{ 1190cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3; 1191cabdff1aSopenharmony_ci __m256i dst0, dst1, dst2, dst3; 1192cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1193cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1194cabdff1aSopenharmony_ci 1195cabdff1aSopenharmony_ci src0 = __lasx_xvldrepl_d(src, 0); 1196cabdff1aSopenharmony_ci src1 = __lasx_xvldrepl_d(src + stride, 0); 1197cabdff1aSopenharmony_ci src2 = __lasx_xvldrepl_d(src + stride_2x, 0); 1198cabdff1aSopenharmony_ci src3 = __lasx_xvldrepl_d(src + stride_3x, 0); 1199cabdff1aSopenharmony_ci dst0 = __lasx_xvldrepl_d(dst, 0); 1200cabdff1aSopenharmony_ci dst1 = __lasx_xvldrepl_d(dst + stride, 0); 1201cabdff1aSopenharmony_ci dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0); 1202cabdff1aSopenharmony_ci dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0); 1203cabdff1aSopenharmony_ci src0 = __lasx_xvpackev_d(src1,src0); 1204cabdff1aSopenharmony_ci src2 = __lasx_xvpackev_d(src3,src2); 1205cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(src0, src2, 0x02); 1206cabdff1aSopenharmony_ci dst0 = __lasx_xvpackev_d(dst1,dst0); 1207cabdff1aSopenharmony_ci dst2 = __lasx_xvpackev_d(dst3,dst2); 1208cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02); 1209cabdff1aSopenharmony_ci dst0 = __lasx_xvavgr_bu(src0, dst0); 1210cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 0); 1211cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride, 0, 1); 1212cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2); 1213cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3); 1214cabdff1aSopenharmony_ci} 1215cabdff1aSopenharmony_ci /* Height dispatcher for the combined horizontal+vertical filter with dst averaging: heights 4 and 8 go to the matching 8x4 / 8x8 kernel, any other height is silently ignored. */ 1216cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst, 1217cabdff1aSopenharmony_ci ptrdiff_t stride, 1218cabdff1aSopenharmony_ci uint32_t coef_hor0, 1219cabdff1aSopenharmony_ci
uint32_t coef_hor1, 1220cabdff1aSopenharmony_ci uint32_t coef_ver0, 1221cabdff1aSopenharmony_ci uint32_t coef_ver1, 1222cabdff1aSopenharmony_ci int32_t height) 1223cabdff1aSopenharmony_ci{ 1224cabdff1aSopenharmony_ci if (4 == height) { 1225cabdff1aSopenharmony_ci avc_chroma_hv_and_aver_dst_8x4_lasx(src, dst, stride, coef_hor0, 1226cabdff1aSopenharmony_ci coef_hor1, coef_ver0, coef_ver1); 1227cabdff1aSopenharmony_ci } else if (8 == height) { 1228cabdff1aSopenharmony_ci avc_chroma_hv_and_aver_dst_8x8_lasx(src, dst, stride, coef_hor0, 1229cabdff1aSopenharmony_ci coef_hor1, coef_ver0, coef_ver1); 1230cabdff1aSopenharmony_ci } 1231cabdff1aSopenharmony_ci} 1232cabdff1aSopenharmony_ci /* Height dispatcher for the horizontal-only filter with dst averaging; only heights 4 and 8 are handled, other heights do nothing. */ 1233cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst, 1234cabdff1aSopenharmony_ci ptrdiff_t stride, uint32_t coeff0, 1235cabdff1aSopenharmony_ci uint32_t coeff1, int32_t height) 1236cabdff1aSopenharmony_ci{ 1237cabdff1aSopenharmony_ci if (4 == height) { 1238cabdff1aSopenharmony_ci avc_chroma_hz_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1); 1239cabdff1aSopenharmony_ci } else if (8 == height) { 1240cabdff1aSopenharmony_ci avc_chroma_hz_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1); 1241cabdff1aSopenharmony_ci } 1242cabdff1aSopenharmony_ci} 1243cabdff1aSopenharmony_ci /* Height dispatcher for the vertical-only filter with dst averaging; only heights 4 and 8 are handled, other heights do nothing. */ 1244cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst, 1245cabdff1aSopenharmony_ci ptrdiff_t stride, uint32_t coeff0, 1246cabdff1aSopenharmony_ci uint32_t coeff1, int32_t height) 1247cabdff1aSopenharmony_ci{ 1248cabdff1aSopenharmony_ci if (4 == height) { 1249cabdff1aSopenharmony_ci avc_chroma_vt_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1); 1250cabdff1aSopenharmony_ci } else if (8 == height) { 1251cabdff1aSopenharmony_ci avc_chroma_vt_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1); 1252cabdff1aSopenharmony_ci } 1253cabdff1aSopenharmony_ci} 1254cabdff1aSopenharmony_ci
/*
 * Unfiltered 8-pixel-wide average of src into dst.
 *
 * Only block heights of 8 and 4 rows are supported; for any other height
 * the function returns without touching dst.
 */
static void avg_width8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                            int32_t height)
{
    switch (height) {
    case 8:
        avg_width8x8_lasx(src, dst, stride);
        break;
    case 4:
        avg_width8x4_lasx(src, dst, stride);
        break;
    default:
        break;
    }
}

/*
 * Averaging H.264 chroma motion compensation for 8-pixel-wide blocks.
 *
 * dst    : destination block; the interpolated result is averaged into it
 * src    : source block
 * stride : row pitch, used for both src and dst
 * height : number of rows, forwarded to the per-mode dispatchers
 * x, y   : horizontal / vertical interpolation weights, asserted to lie
 *          in the range 0..7; the complementary weights are 8 - x, 8 - y
 *
 * The kernel is chosen from which of x and y are non-zero. Note that the
 * helpers take (src, dst) while this entry point takes (dst, src).
 */
void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    /* Both offsets set: two-dimensional (horizontal + vertical) filter. */
    if (x && y) {
        avc_chroma_hv_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), y,
                                           (8 - y), height);
        return;
    }

    /* Only the horizontal offset set. */
    if (x) {
        avc_chroma_hz_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x),
                                           height);
        return;
    }

    /* Only the vertical offset set. */
    if (y) {
        avc_chroma_vt_and_aver_dst_8w_lasx(src, dst, stride, y, (8 - y),
                                           height);
        return;
    }

    /* No offset at all: plain average of src and dst. */
    avg_width8_lasx(src, dst, stride, height);
}