1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited 3cabdff1aSopenharmony_ci * Contributed by Hao Chen <chenhao@loongson.cn> 4cabdff1aSopenharmony_ci * 5cabdff1aSopenharmony_ci * This file is part of FFmpeg. 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15cabdff1aSopenharmony_ci * Lesser General Public License for more details. 16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20cabdff1aSopenharmony_ci */ 21cabdff1aSopenharmony_ci 22cabdff1aSopenharmony_ci#include "libavcodec/vp9dsp.h" 23cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 24cabdff1aSopenharmony_ci#include "vp9dsp_loongarch.h" 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_ci#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, \ 27cabdff1aSopenharmony_ci _dst5, _dst6, _dst7, _dst, _stride, \ 28cabdff1aSopenharmony_ci _stride2, _stride3, _stride4) \ 29cabdff1aSopenharmony_ci{ \ 30cabdff1aSopenharmony_ci __lsx_vst(_dst0, _dst, 0); \ 31cabdff1aSopenharmony_ci __lsx_vstx(_dst1, _dst, _stride); \ 32cabdff1aSopenharmony_ci __lsx_vstx(_dst2, _dst, _stride2); \ 33cabdff1aSopenharmony_ci __lsx_vstx(_dst3, _dst, _stride3); \ 34cabdff1aSopenharmony_ci _dst += _stride4; \ 35cabdff1aSopenharmony_ci __lsx_vst(_dst4, _dst, 0); \ 36cabdff1aSopenharmony_ci __lsx_vstx(_dst5, _dst, _stride); \ 37cabdff1aSopenharmony_ci __lsx_vstx(_dst6, _dst, _stride2); \ 38cabdff1aSopenharmony_ci __lsx_vstx(_dst7, _dst, _stride3); \ 39cabdff1aSopenharmony_ci} 40cabdff1aSopenharmony_ci 41cabdff1aSopenharmony_ci#define LSX_ST_8X16(_dst0, _dst1, _dst2, _dst3, _dst4, \ 42cabdff1aSopenharmony_ci _dst5, _dst6, _dst7, _dst, _stride) \ 43cabdff1aSopenharmony_ci{ \ 44cabdff1aSopenharmony_ci __lsx_vst(_dst0, _dst, 0); \ 45cabdff1aSopenharmony_ci __lsx_vst(_dst0, _dst, 16); \ 46cabdff1aSopenharmony_ci _dst += _stride; \ 47cabdff1aSopenharmony_ci __lsx_vst(_dst1, _dst, 0); \ 48cabdff1aSopenharmony_ci __lsx_vst(_dst1, _dst, 16); \ 49cabdff1aSopenharmony_ci _dst += _stride; \ 50cabdff1aSopenharmony_ci __lsx_vst(_dst2, _dst, 0); \ 51cabdff1aSopenharmony_ci __lsx_vst(_dst2, _dst, 16); \ 52cabdff1aSopenharmony_ci _dst += _stride; \ 53cabdff1aSopenharmony_ci __lsx_vst(_dst3, _dst, 0); \ 54cabdff1aSopenharmony_ci __lsx_vst(_dst3, _dst, 16); \ 55cabdff1aSopenharmony_ci _dst += _stride; \ 56cabdff1aSopenharmony_ci __lsx_vst(_dst4, _dst, 0); \ 57cabdff1aSopenharmony_ci __lsx_vst(_dst4, _dst, 16); \ 58cabdff1aSopenharmony_ci _dst += _stride; \ 59cabdff1aSopenharmony_ci __lsx_vst(_dst5, _dst, 0); \ 60cabdff1aSopenharmony_ci __lsx_vst(_dst5, _dst, 16); \ 61cabdff1aSopenharmony_ci _dst += _stride; \ 62cabdff1aSopenharmony_ci __lsx_vst(_dst6, _dst, 0); \ 63cabdff1aSopenharmony_ci __lsx_vst(_dst6, _dst, 16); \ 64cabdff1aSopenharmony_ci _dst += _stride; \ 65cabdff1aSopenharmony_ci __lsx_vst(_dst7, _dst, 0); \ 66cabdff1aSopenharmony_ci __lsx_vst(_dst7, _dst, 16); \ 67cabdff1aSopenharmony_ci _dst += _stride; \ 68cabdff1aSopenharmony_ci} 69cabdff1aSopenharmony_ci 70cabdff1aSopenharmony_civoid ff_vert_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, 71cabdff1aSopenharmony_ci const uint8_t *src) 72cabdff1aSopenharmony_ci{ 73cabdff1aSopenharmony_ci __m128i src0; 74cabdff1aSopenharmony_ci ptrdiff_t stride2 = dst_stride << 1; 75cabdff1aSopenharmony_ci ptrdiff_t stride3 = stride2 + dst_stride; 76cabdff1aSopenharmony_ci ptrdiff_t stride4 = stride2 << 1; 77cabdff1aSopenharmony_ci src0 = __lsx_vld(src, 0); 78cabdff1aSopenharmony_ci LSX_ST_8(src0, src0, src0, src0, src0, src0, src0, src0, dst, 79cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); 80cabdff1aSopenharmony_ci dst += stride4; 81cabdff1aSopenharmony_ci LSX_ST_8(src0, src0, src0, src0, src0, src0, src0, src0, dst, 82cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); 83cabdff1aSopenharmony_ci} 84cabdff1aSopenharmony_ci 85cabdff1aSopenharmony_civoid ff_vert_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, 86cabdff1aSopenharmony_ci const uint8_t *src) 87cabdff1aSopenharmony_ci{ 88cabdff1aSopenharmony_ci uint32_t row; 89cabdff1aSopenharmony_ci __m128i src0, src1; 90cabdff1aSopenharmony_ci 91cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); 92cabdff1aSopenharmony_ci for (row = 32; row--;) { 93cabdff1aSopenharmony_ci __lsx_vst(src0, dst, 0); 94cabdff1aSopenharmony_ci __lsx_vst(src1, dst, 16); 95cabdff1aSopenharmony_ci dst += dst_stride; 96cabdff1aSopenharmony_ci } 97cabdff1aSopenharmony_ci} 98cabdff1aSopenharmony_ci 99cabdff1aSopenharmony_civoid ff_hor_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, 100cabdff1aSopenharmony_ci const uint8_t *top) 101cabdff1aSopenharmony_ci{ 102cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 103cabdff1aSopenharmony_ci __m128i src8, src9, src10, src11, src12, src13, src14, src15; 104cabdff1aSopenharmony_ci ptrdiff_t stride2 = dst_stride << 1; 105cabdff1aSopenharmony_ci ptrdiff_t stride3 = stride2 + dst_stride; 106cabdff1aSopenharmony_ci ptrdiff_t stride4 = stride2 << 1; 107cabdff1aSopenharmony_ci 108cabdff1aSopenharmony_ci src15 = __lsx_vldrepl_b(src, 0); 109cabdff1aSopenharmony_ci src14 = __lsx_vldrepl_b(src, 1); 110cabdff1aSopenharmony_ci src13 = __lsx_vldrepl_b(src, 2); 111cabdff1aSopenharmony_ci src12 = __lsx_vldrepl_b(src, 3); 112cabdff1aSopenharmony_ci src11 = __lsx_vldrepl_b(src, 4); 113cabdff1aSopenharmony_ci src10 = __lsx_vldrepl_b(src, 5); 114cabdff1aSopenharmony_ci src9 = __lsx_vldrepl_b(src, 6); 115cabdff1aSopenharmony_ci src8 = __lsx_vldrepl_b(src, 7); 116cabdff1aSopenharmony_ci src7 = __lsx_vldrepl_b(src, 8); 117cabdff1aSopenharmony_ci src6 = __lsx_vldrepl_b(src, 9); 118cabdff1aSopenharmony_ci src5 = __lsx_vldrepl_b(src, 10); 119cabdff1aSopenharmony_ci src4 = __lsx_vldrepl_b(src, 11); 120cabdff1aSopenharmony_ci src3 = __lsx_vldrepl_b(src, 12); 121cabdff1aSopenharmony_ci src2 = __lsx_vldrepl_b(src, 13); 122cabdff1aSopenharmony_ci src1 = __lsx_vldrepl_b(src, 14); 123cabdff1aSopenharmony_ci src0 = __lsx_vldrepl_b(src, 15); 124cabdff1aSopenharmony_ci LSX_ST_8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 125cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); 126cabdff1aSopenharmony_ci dst += stride4; 127cabdff1aSopenharmony_ci LSX_ST_8(src8, src9, src10, src11, src12, src13, src14, src15, dst, 128cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); 129cabdff1aSopenharmony_ci} 130cabdff1aSopenharmony_ci 131cabdff1aSopenharmony_civoid ff_hor_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, 132cabdff1aSopenharmony_ci const uint8_t *top) 133cabdff1aSopenharmony_ci{ 134cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 135cabdff1aSopenharmony_ci __m128i src8, src9, src10, src11, src12, src13, src14, src15; 136cabdff1aSopenharmony_ci __m128i src16, src17, src18, src19, src20, src21, src22, src23; 137cabdff1aSopenharmony_ci __m128i src24, src25, src26, src27, src28, src29, src30, src31; 138cabdff1aSopenharmony_ci 139cabdff1aSopenharmony_ci src31 = __lsx_vldrepl_b(src, 0); 140cabdff1aSopenharmony_ci src30 = __lsx_vldrepl_b(src, 1); 141cabdff1aSopenharmony_ci src29 = __lsx_vldrepl_b(src, 2); 142cabdff1aSopenharmony_ci src28 = __lsx_vldrepl_b(src, 3); 143cabdff1aSopenharmony_ci src27 = __lsx_vldrepl_b(src, 4); 144cabdff1aSopenharmony_ci src26 = __lsx_vldrepl_b(src, 5); 145cabdff1aSopenharmony_ci src25 = __lsx_vldrepl_b(src, 6); 146cabdff1aSopenharmony_ci src24 = __lsx_vldrepl_b(src, 7); 147cabdff1aSopenharmony_ci src23 = __lsx_vldrepl_b(src, 8); 148cabdff1aSopenharmony_ci src22 = __lsx_vldrepl_b(src, 9); 149cabdff1aSopenharmony_ci src21 = __lsx_vldrepl_b(src, 10); 150cabdff1aSopenharmony_ci src20 = __lsx_vldrepl_b(src, 11); 151cabdff1aSopenharmony_ci src19 = __lsx_vldrepl_b(src, 12); 152cabdff1aSopenharmony_ci src18 = __lsx_vldrepl_b(src, 13); 153cabdff1aSopenharmony_ci src17 = __lsx_vldrepl_b(src, 14); 154cabdff1aSopenharmony_ci src16 = __lsx_vldrepl_b(src, 15); 155cabdff1aSopenharmony_ci src15 = __lsx_vldrepl_b(src, 16); 156cabdff1aSopenharmony_ci src14 = __lsx_vldrepl_b(src, 17); 157cabdff1aSopenharmony_ci src13 = __lsx_vldrepl_b(src, 18); 158cabdff1aSopenharmony_ci src12 = __lsx_vldrepl_b(src, 19); 159cabdff1aSopenharmony_ci src11 = __lsx_vldrepl_b(src, 20); 160cabdff1aSopenharmony_ci src10 = __lsx_vldrepl_b(src, 21); 161cabdff1aSopenharmony_ci src9 = __lsx_vldrepl_b(src, 22); 162cabdff1aSopenharmony_ci src8 = __lsx_vldrepl_b(src, 23); 163cabdff1aSopenharmony_ci src7 = __lsx_vldrepl_b(src, 24); 164cabdff1aSopenharmony_ci src6 = __lsx_vldrepl_b(src, 25); 165cabdff1aSopenharmony_ci src5 = __lsx_vldrepl_b(src, 26); 166cabdff1aSopenharmony_ci src4 = __lsx_vldrepl_b(src, 27); 167cabdff1aSopenharmony_ci src3 = __lsx_vldrepl_b(src, 28); 168cabdff1aSopenharmony_ci src2 = __lsx_vldrepl_b(src, 29); 169cabdff1aSopenharmony_ci src1 = __lsx_vldrepl_b(src, 30); 170cabdff1aSopenharmony_ci src0 = __lsx_vldrepl_b(src, 31); 171cabdff1aSopenharmony_ci LSX_ST_8X16(src0, src1, src2, src3, src4, src5, src6, src7, 172cabdff1aSopenharmony_ci dst, dst_stride); 173cabdff1aSopenharmony_ci LSX_ST_8X16(src8, src9, src10, src11, src12, src13, src14, src15, 174cabdff1aSopenharmony_ci dst, dst_stride); 175cabdff1aSopenharmony_ci LSX_ST_8X16(src16, src17, src18, src19, src20, src21, src22, src23, 176cabdff1aSopenharmony_ci dst, dst_stride); 177cabdff1aSopenharmony_ci LSX_ST_8X16(src24, src25, src26, src27, src28, src29, src30, src31, 178cabdff1aSopenharmony_ci dst, dst_stride); 179cabdff1aSopenharmony_ci} 180cabdff1aSopenharmony_ci 181cabdff1aSopenharmony_civoid ff_dc_4x4_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, 182cabdff1aSopenharmony_ci const uint8_t *src_top) 183cabdff1aSopenharmony_ci{ 184cabdff1aSopenharmony_ci __m128i tmp0, tmp1, dst0; 185cabdff1aSopenharmony_ci 186cabdff1aSopenharmony_ci tmp0 = __lsx_vldrepl_w(src_top, 0); 187cabdff1aSopenharmony_ci tmp1 = __lsx_vldrepl_w(src_left, 0); 188cabdff1aSopenharmony_ci dst0 = __lsx_vilvl_w(tmp1, tmp0); 189cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_hu_bu(dst0, dst0); 190cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_wu_hu(dst0, dst0); 191cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_du_wu(dst0, dst0); 192cabdff1aSopenharmony_ci dst0 = __lsx_vsrari_w(dst0, 3); 193cabdff1aSopenharmony_ci dst0 = __lsx_vshuf4i_b(dst0, 0); 194cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); 195cabdff1aSopenharmony_ci dst += dst_stride; 196cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); 197cabdff1aSopenharmony_ci dst += dst_stride; 198cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); 199cabdff1aSopenharmony_ci dst += dst_stride; 200cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); 201cabdff1aSopenharmony_ci} 202cabdff1aSopenharmony_ci 203cabdff1aSopenharmony_ci#define INTRA_DC_TL_4X4(dir) \ 204cabdff1aSopenharmony_civoid ff_dc_##dir##_4x4_lsx(uint8_t *dst, ptrdiff_t dst_stride, \ 205cabdff1aSopenharmony_ci const uint8_t *left, \ 206cabdff1aSopenharmony_ci const uint8_t *top) \ 207cabdff1aSopenharmony_ci{ \ 208cabdff1aSopenharmony_ci __m128i tmp0, dst0; \ 209cabdff1aSopenharmony_ci \ 210cabdff1aSopenharmony_ci tmp0 = __lsx_vldrepl_w(dir, 0); \ 211cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_hu_bu(tmp0, tmp0); \ 212cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_wu_hu(dst0, dst0); \ 213cabdff1aSopenharmony_ci dst0 = __lsx_vsrari_w(dst0, 2); \ 214cabdff1aSopenharmony_ci dst0 = __lsx_vshuf4i_b(dst0, 0); \ 215cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); \ 216cabdff1aSopenharmony_ci dst += dst_stride; \ 217cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); \ 218cabdff1aSopenharmony_ci dst += dst_stride; \ 219cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); \ 220cabdff1aSopenharmony_ci dst += dst_stride; \ 221cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); \ 222cabdff1aSopenharmony_ci} 223cabdff1aSopenharmony_ciINTRA_DC_TL_4X4(top); 224cabdff1aSopenharmony_ciINTRA_DC_TL_4X4(left); 225cabdff1aSopenharmony_ci 226cabdff1aSopenharmony_civoid ff_dc_8x8_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, 227cabdff1aSopenharmony_ci const uint8_t *src_top) 228cabdff1aSopenharmony_ci{ 229cabdff1aSopenharmony_ci __m128i tmp0, tmp1, dst0; 230cabdff1aSopenharmony_ci 231cabdff1aSopenharmony_ci tmp0 = __lsx_vldrepl_d(src_top, 0); 232cabdff1aSopenharmony_ci tmp1 = __lsx_vldrepl_d(src_left, 0); 233cabdff1aSopenharmony_ci dst0 = __lsx_vilvl_d(tmp1, tmp0); 234cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_hu_bu(dst0, dst0); 235cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_wu_hu(dst0, dst0); 236cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_du_wu(dst0, dst0); 237cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_qu_du(dst0, dst0); 238cabdff1aSopenharmony_ci dst0 = __lsx_vsrari_w(dst0, 4); 239cabdff1aSopenharmony_ci dst0 = __lsx_vreplvei_b(dst0, 0); 240cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); 241cabdff1aSopenharmony_ci dst += dst_stride; 242cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); 243cabdff1aSopenharmony_ci dst += dst_stride; 244cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); 245cabdff1aSopenharmony_ci dst += dst_stride; 246cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); 247cabdff1aSopenharmony_ci dst += dst_stride; 248cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); 249cabdff1aSopenharmony_ci dst += dst_stride; 250cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); 251cabdff1aSopenharmony_ci dst += dst_stride; 252cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); 253cabdff1aSopenharmony_ci dst += dst_stride; 254cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); 255cabdff1aSopenharmony_ci} 256cabdff1aSopenharmony_ci 257cabdff1aSopenharmony_ci#define INTRA_DC_TL_8X8(dir) \ 258cabdff1aSopenharmony_civoid ff_dc_##dir##_8x8_lsx(uint8_t *dst, ptrdiff_t dst_stride, \ 259cabdff1aSopenharmony_ci const uint8_t *left, \ 260cabdff1aSopenharmony_ci const uint8_t *top) \ 261cabdff1aSopenharmony_ci{ \ 262cabdff1aSopenharmony_ci __m128i tmp0, dst0; \ 263cabdff1aSopenharmony_ci \ 264cabdff1aSopenharmony_ci tmp0 = __lsx_vldrepl_d(dir, 0); \ 265cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_hu_bu(tmp0, tmp0); \ 266cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_wu_hu(dst0, dst0); \ 267cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_du_wu(dst0, dst0); \ 268cabdff1aSopenharmony_ci dst0 = __lsx_vsrari_w(dst0, 3); \ 269cabdff1aSopenharmony_ci dst0 = __lsx_vreplvei_b(dst0, 0); \ 270cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); \ 271cabdff1aSopenharmony_ci dst += dst_stride; \ 272cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); \ 273cabdff1aSopenharmony_ci dst += dst_stride; \ 274cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); \ 275cabdff1aSopenharmony_ci dst += dst_stride; \ 276cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); \ 277cabdff1aSopenharmony_ci dst += dst_stride; \ 278cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); \ 279cabdff1aSopenharmony_ci dst += dst_stride; \ 280cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); \ 281cabdff1aSopenharmony_ci dst += dst_stride; \ 282cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); \ 283cabdff1aSopenharmony_ci dst += dst_stride; \ 284cabdff1aSopenharmony_ci __lsx_vstelm_d(dst0, dst, 0, 0); \ 285cabdff1aSopenharmony_ci} 286cabdff1aSopenharmony_ci 287cabdff1aSopenharmony_ciINTRA_DC_TL_8X8(top); 288cabdff1aSopenharmony_ciINTRA_DC_TL_8X8(left); 289cabdff1aSopenharmony_ci 290cabdff1aSopenharmony_civoid ff_dc_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride, 291cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top) 292cabdff1aSopenharmony_ci{ 293cabdff1aSopenharmony_ci __m128i tmp0, tmp1, dst0; 294cabdff1aSopenharmony_ci ptrdiff_t stride2 = dst_stride << 1; 295cabdff1aSopenharmony_ci ptrdiff_t stride3 = stride2 + dst_stride; 296cabdff1aSopenharmony_ci ptrdiff_t stride4 = stride2 << 1; 297cabdff1aSopenharmony_ci 298cabdff1aSopenharmony_ci tmp0 = __lsx_vld(src_top, 0); 299cabdff1aSopenharmony_ci tmp1 = __lsx_vld(src_left, 0); 300cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp0, tmp1); 301cabdff1aSopenharmony_ci dst0 = __lsx_vadd_h(tmp0, tmp1); 302cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_wu_hu(dst0, dst0); 303cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_du_wu(dst0, dst0); 304cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_qu_du(dst0, dst0); 305cabdff1aSopenharmony_ci dst0 = __lsx_vsrari_w(dst0, 5); 306cabdff1aSopenharmony_ci dst0 = __lsx_vreplvei_b(dst0, 0); 307cabdff1aSopenharmony_ci LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst, 308cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); 309cabdff1aSopenharmony_ci dst += stride4; 310cabdff1aSopenharmony_ci LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst, 311cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); 312cabdff1aSopenharmony_ci} 313cabdff1aSopenharmony_ci 314cabdff1aSopenharmony_ci#define INTRA_DC_TL_16X16(dir) \ 315cabdff1aSopenharmony_civoid ff_dc_##dir##_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride, \ 316cabdff1aSopenharmony_ci const uint8_t *left, \ 317cabdff1aSopenharmony_ci const uint8_t *top) \ 318cabdff1aSopenharmony_ci{ \ 319cabdff1aSopenharmony_ci __m128i tmp0, dst0; \ 320cabdff1aSopenharmony_ci ptrdiff_t stride2 = dst_stride << 1; \ 321cabdff1aSopenharmony_ci ptrdiff_t stride3 = stride2 + dst_stride; \ 322cabdff1aSopenharmony_ci ptrdiff_t stride4 = stride2 << 1; \ 323cabdff1aSopenharmony_ci \ 324cabdff1aSopenharmony_ci tmp0 = __lsx_vld(dir, 0); \ 325cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_hu_bu(tmp0, tmp0); \ 326cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_wu_hu(dst0, dst0); \ 327cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_du_wu(dst0, dst0); \ 328cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_qu_du(dst0, dst0); \ 329cabdff1aSopenharmony_ci dst0 = __lsx_vsrari_w(dst0, 4); \ 330cabdff1aSopenharmony_ci dst0 = __lsx_vreplvei_b(dst0, 0); \ 331cabdff1aSopenharmony_ci LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst, \ 332cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); \ 333cabdff1aSopenharmony_ci dst += stride4; \ 334cabdff1aSopenharmony_ci LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst, \ 335cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); \ 336cabdff1aSopenharmony_ci} 337cabdff1aSopenharmony_ci 338cabdff1aSopenharmony_ciINTRA_DC_TL_16X16(top); 339cabdff1aSopenharmony_ciINTRA_DC_TL_16X16(left); 340cabdff1aSopenharmony_ci 341cabdff1aSopenharmony_civoid ff_dc_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride, 342cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top) 343cabdff1aSopenharmony_ci{ 344cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3, dst0; 345cabdff1aSopenharmony_ci 346cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src_top, 0, src_top, 16, tmp0, tmp1); 347cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src_left, 0, src_left, 16, tmp2, tmp3); 348cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, 349cabdff1aSopenharmony_ci tmp3, tmp3, tmp0, tmp1, tmp2, tmp3); 350cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp0, tmp1); 351cabdff1aSopenharmony_ci dst0 = __lsx_vadd_h(tmp0, tmp1); 352cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_wu_hu(dst0, dst0); 353cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_du_wu(dst0, dst0); 354cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_qu_du(dst0, dst0); 355cabdff1aSopenharmony_ci dst0 = __lsx_vsrari_w(dst0, 6); 356cabdff1aSopenharmony_ci dst0 = __lsx_vreplvei_b(dst0, 0); 357cabdff1aSopenharmony_ci LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, 358cabdff1aSopenharmony_ci dst, dst_stride); 359cabdff1aSopenharmony_ci LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, 360cabdff1aSopenharmony_ci dst, dst_stride); 361cabdff1aSopenharmony_ci LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, 362cabdff1aSopenharmony_ci dst, dst_stride); 363cabdff1aSopenharmony_ci LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, 364cabdff1aSopenharmony_ci dst, dst_stride); 365cabdff1aSopenharmony_ci} 366cabdff1aSopenharmony_ci 367cabdff1aSopenharmony_ci#define INTRA_DC_TL_32X32(dir) \ 368cabdff1aSopenharmony_civoid ff_dc_##dir##_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride, \ 369cabdff1aSopenharmony_ci const uint8_t *left, \ 370cabdff1aSopenharmony_ci const uint8_t *top) \ 371cabdff1aSopenharmony_ci{ \ 372cabdff1aSopenharmony_ci __m128i tmp0, tmp1, dst0; \ 373cabdff1aSopenharmony_ci \ 374cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, dir, 0, dir, 16, tmp0, tmp1); \ 375cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp0, tmp1); \ 376cabdff1aSopenharmony_ci dst0 = __lsx_vadd_h(tmp0, tmp1); \ 377cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_wu_hu(dst0, dst0); \ 378cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_du_wu(dst0, dst0); \ 379cabdff1aSopenharmony_ci dst0 = __lsx_vhaddw_qu_du(dst0, dst0); \ 380cabdff1aSopenharmony_ci dst0 = __lsx_vsrari_w(dst0, 5); \ 381cabdff1aSopenharmony_ci dst0 = __lsx_vreplvei_b(dst0, 0); \ 382cabdff1aSopenharmony_ci LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, \ 383cabdff1aSopenharmony_ci dst, dst_stride); \ 384cabdff1aSopenharmony_ci LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, \ 385cabdff1aSopenharmony_ci dst, dst_stride); \ 386cabdff1aSopenharmony_ci LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, \ 387cabdff1aSopenharmony_ci dst, dst_stride); \ 388cabdff1aSopenharmony_ci LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, \ 389cabdff1aSopenharmony_ci dst, dst_stride); \ 390cabdff1aSopenharmony_ci} 391cabdff1aSopenharmony_ci 392cabdff1aSopenharmony_ciINTRA_DC_TL_32X32(top); 393cabdff1aSopenharmony_ciINTRA_DC_TL_32X32(left); 394cabdff1aSopenharmony_ci 395cabdff1aSopenharmony_ci#define INTRA_PREDICT_VALDC_16X16_LSX(val) \ 396cabdff1aSopenharmony_civoid ff_dc_##val##_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride, \ 397cabdff1aSopenharmony_ci const uint8_t *left, const uint8_t *top) \ 398cabdff1aSopenharmony_ci{ \ 399cabdff1aSopenharmony_ci __m128i out = __lsx_vldi(val); \ 400cabdff1aSopenharmony_ci ptrdiff_t stride2 = dst_stride << 1; \ 401cabdff1aSopenharmony_ci ptrdiff_t stride3 = stride2 + dst_stride; \ 402cabdff1aSopenharmony_ci ptrdiff_t stride4 = stride2 << 1; \ 403cabdff1aSopenharmony_ci \ 404cabdff1aSopenharmony_ci LSX_ST_8(out, out, out, out, out, out, out, out, dst, \ 405cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); \ 406cabdff1aSopenharmony_ci dst += stride4; \ 407cabdff1aSopenharmony_ci LSX_ST_8(out, out, out, out, out, out, out, out, dst, \ 408cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); \ 409cabdff1aSopenharmony_ci} 410cabdff1aSopenharmony_ci 411cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_LSX(127); 412cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_LSX(128); 413cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_LSX(129); 414cabdff1aSopenharmony_ci 415cabdff1aSopenharmony_ci#define INTRA_PREDICT_VALDC_32X32_LSX(val) \ 416cabdff1aSopenharmony_civoid ff_dc_##val##_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride, \ 417cabdff1aSopenharmony_ci const uint8_t *left, const uint8_t *top) \ 418cabdff1aSopenharmony_ci{ \ 419cabdff1aSopenharmony_ci __m128i out = __lsx_vldi(val); \ 420cabdff1aSopenharmony_ci \ 421cabdff1aSopenharmony_ci LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\ 422cabdff1aSopenharmony_ci LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\ 423cabdff1aSopenharmony_ci LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\ 424cabdff1aSopenharmony_ci LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\ 425cabdff1aSopenharmony_ci} 426cabdff1aSopenharmony_ci 427cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_LSX(127); 428cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_LSX(128); 429cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_LSX(129); 430cabdff1aSopenharmony_ci 431cabdff1aSopenharmony_civoid ff_tm_4x4_lsx(uint8_t *dst, ptrdiff_t dst_stride, 432cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top_ptr) 433cabdff1aSopenharmony_ci{ 434cabdff1aSopenharmony_ci uint8_t top_left = src_top_ptr[-1]; 435cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3, reg0, reg1; 436cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3; 437cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3; 438cabdff1aSopenharmony_ci 439cabdff1aSopenharmony_ci reg0 = __lsx_vreplgr2vr_h(top_left); 440cabdff1aSopenharmony_ci reg1 = __lsx_vld(src_top_ptr, 0); 441cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2, src_left, 442cabdff1aSopenharmony_ci 3, tmp3, tmp2, tmp1, tmp0); 443cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3, reg1, 444cabdff1aSopenharmony_ci src0, src1, src2, src3); 445cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vhaddw_hu_bu, src0, src0, src1, src1, src2, src2, src3, 446cabdff1aSopenharmony_ci src3, dst0, dst1, dst2, dst3); 447cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, dst0, reg0, dst1, reg0, dst2, reg0, dst3, reg0, 448cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 449cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, dst0, 7, dst1, 7, dst2, 7, dst3, 7, 450cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 451cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1); 452cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 0); 453cabdff1aSopenharmony_ci dst += dst_stride; 454cabdff1aSopenharmony_ci __lsx_vstelm_w(dst0, dst, 0, 2); 455cabdff1aSopenharmony_ci dst += dst_stride; 456cabdff1aSopenharmony_ci __lsx_vstelm_w(dst1, dst, 0, 0); 457cabdff1aSopenharmony_ci dst += dst_stride; 458cabdff1aSopenharmony_ci __lsx_vstelm_w(dst1, dst, 0, 2); 459cabdff1aSopenharmony_ci} 460cabdff1aSopenharmony_ci 461cabdff1aSopenharmony_civoid ff_tm_8x8_lsx(uint8_t *dst, ptrdiff_t dst_stride, 462cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top_ptr) 463cabdff1aSopenharmony_ci{ 464cabdff1aSopenharmony_ci uint8_t top_left = src_top_ptr[-1]; 465cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 466cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 467cabdff1aSopenharmony_ci __m128i reg0, reg1; 468cabdff1aSopenharmony_ci 469cabdff1aSopenharmony_ci reg0 = __lsx_vreplgr2vr_h(top_left); 470cabdff1aSopenharmony_ci reg1 = __lsx_vld(src_top_ptr, 0); 471cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2, src_left, 472cabdff1aSopenharmony_ci 3, tmp7, tmp6, tmp5, tmp4); 473cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_b, src_left, 4, src_left, 5, src_left, 6, src_left, 474cabdff1aSopenharmony_ci 7, tmp3, tmp2, tmp1, tmp0); 475cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3, reg1, 476cabdff1aSopenharmony_ci src0, src1, src2, src3); 477cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, tmp4, reg1, tmp5, reg1, tmp6, reg1, tmp7, reg1, 478cabdff1aSopenharmony_ci src4, src5, src6, src7); 479cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vhaddw_hu_bu, src0, src0, src1, src1, src2, src2, src3, 480cabdff1aSopenharmony_ci src3, src0, src1, src2, src3); 481cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vhaddw_hu_bu, src4, src4, src5, src5, src6, src6, src7, 482cabdff1aSopenharmony_ci src7, src4, src5, src6, src7); 483cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0, 484cabdff1aSopenharmony_ci src0, src1, src2, src3); 485cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0, 486cabdff1aSopenharmony_ci src4, src5, src6, src7); 487cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7, 488cabdff1aSopenharmony_ci src0, src1, src2, src3); 489cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7, 490cabdff1aSopenharmony_ci src4, src5, src6, src7); 491cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, src5, src4, src7, src6, 492cabdff1aSopenharmony_ci src0, src1, src2, src3); 493cabdff1aSopenharmony_ci __lsx_vstelm_d(src0, dst, 0, 0); 494cabdff1aSopenharmony_ci dst += dst_stride; 495cabdff1aSopenharmony_ci __lsx_vstelm_d(src0, dst, 0, 1); 496cabdff1aSopenharmony_ci dst += dst_stride; 497cabdff1aSopenharmony_ci __lsx_vstelm_d(src1, dst, 0, 0); 498cabdff1aSopenharmony_ci dst += dst_stride; 499cabdff1aSopenharmony_ci __lsx_vstelm_d(src1, dst, 0, 1); 500cabdff1aSopenharmony_ci dst += dst_stride; 501cabdff1aSopenharmony_ci __lsx_vstelm_d(src2, dst, 0, 0); 502cabdff1aSopenharmony_ci dst += dst_stride; 503cabdff1aSopenharmony_ci __lsx_vstelm_d(src2, dst, 0, 1); 504cabdff1aSopenharmony_ci dst += dst_stride; 505cabdff1aSopenharmony_ci __lsx_vstelm_d(src3, dst, 0, 0); 506cabdff1aSopenharmony_ci dst += dst_stride; 507cabdff1aSopenharmony_ci __lsx_vstelm_d(src3, dst, 0, 1); 508cabdff1aSopenharmony_ci} 509cabdff1aSopenharmony_ci 510cabdff1aSopenharmony_civoid ff_tm_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride, 511cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top_ptr) 512cabdff1aSopenharmony_ci{ 513cabdff1aSopenharmony_ci uint8_t top_left = src_top_ptr[-1]; 514cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 515cabdff1aSopenharmony_ci __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; 516cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 517cabdff1aSopenharmony_ci __m128i reg0, reg1; 518cabdff1aSopenharmony_ci ptrdiff_t stride2 = dst_stride << 1; 519cabdff1aSopenharmony_ci ptrdiff_t stride3 = stride2 + dst_stride; 520cabdff1aSopenharmony_ci ptrdiff_t stride4 = stride2 << 1; 521cabdff1aSopenharmony_ci 522cabdff1aSopenharmony_ci reg0 = __lsx_vreplgr2vr_h(top_left); 523cabdff1aSopenharmony_ci reg1 = __lsx_vld(src_top_ptr, 0); 524cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2, src_left, 525cabdff1aSopenharmony_ci 3, tmp15, tmp14, tmp13, tmp12); 526cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_b, src_left, 4, src_left, 5, src_left, 6, src_left, 527cabdff1aSopenharmony_ci 7, tmp11, tmp10, tmp9, tmp8); 528cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_b, src_left, 8, src_left, 9, src_left, 10, 529cabdff1aSopenharmony_ci src_left, 11, tmp7, tmp6, tmp5, tmp4); 530cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_b, src_left, 12, src_left, 13, src_left, 14, 531cabdff1aSopenharmony_ci src_left, 15, tmp3, tmp2, tmp1, tmp0); 532cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwev_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3, 533cabdff1aSopenharmony_ci reg1, src0, src1, src2, src3); 534cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwod_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3, 535cabdff1aSopenharmony_ci reg1, src4, src5, src6, src7); 536cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0, 537cabdff1aSopenharmony_ci src0, src1, src2, src3); 538cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0, 539cabdff1aSopenharmony_ci src4, src5, src6, src7); 540cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7, 541cabdff1aSopenharmony_ci src0, src1, src2, src3); 542cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7, 543cabdff1aSopenharmony_ci src4, src5, src6, src7); 544cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3, 545cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 546cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwev_h_bu, tmp4, reg1, tmp5, reg1, tmp6, reg1, tmp7, 547cabdff1aSopenharmony_ci reg1, src0, src1, src2, src3); 548cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwod_h_bu, tmp4, reg1, tmp5, reg1, tmp6, reg1, tmp7, 549cabdff1aSopenharmony_ci reg1, src4, src5, src6, src7); 550cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0, 551cabdff1aSopenharmony_ci src0, src1, src2, src3); 552cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0, 553cabdff1aSopenharmony_ci src4, src5, src6, src7); 554cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7, 555cabdff1aSopenharmony_ci src0, src1, src2, src3); 556cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7, 557cabdff1aSopenharmony_ci src4, src5, src6, src7); 558cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3, 559cabdff1aSopenharmony_ci tmp4, tmp5, tmp6, tmp7); 560cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwev_h_bu, tmp8, reg1, tmp9, reg1, tmp10, reg1, tmp11, 561cabdff1aSopenharmony_ci reg1, src0, src1, src2, src3); 562cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwod_h_bu, tmp8, reg1, tmp9, reg1, tmp10, reg1, tmp11, 563cabdff1aSopenharmony_ci reg1, src4, src5, src6, src7); 564cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0, 565cabdff1aSopenharmony_ci src0, src1, src2, src3); 566cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0, 567cabdff1aSopenharmony_ci src4, src5, src6, src7); 568cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7, 569cabdff1aSopenharmony_ci src0, src1, src2, src3); 570cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7, 571cabdff1aSopenharmony_ci src4, src5, src6, src7); 572cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3, 573cabdff1aSopenharmony_ci tmp8, tmp9, tmp10, tmp11); 574cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwev_h_bu, tmp12, reg1, tmp13, reg1, tmp14, reg1, 575cabdff1aSopenharmony_ci tmp15, reg1, src0, src1, src2, src3); 576cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwod_h_bu, tmp12, reg1, tmp13, reg1, tmp14, reg1, 577cabdff1aSopenharmony_ci tmp15, reg1, src4, src5, src6, src7); 578cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0, 579cabdff1aSopenharmony_ci src0, src1, src2, src3); 580cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0, 581cabdff1aSopenharmony_ci src4, src5, src6, src7); 582cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7, 583cabdff1aSopenharmony_ci src0, src1, src2, src3); 584cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7, 585cabdff1aSopenharmony_ci src4, src5, src6, src7); 586cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3, 587cabdff1aSopenharmony_ci tmp12, tmp13, tmp14, tmp15); 588cabdff1aSopenharmony_ci LSX_ST_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, dst, 589cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); 590cabdff1aSopenharmony_ci dst += stride4; 591cabdff1aSopenharmony_ci LSX_ST_8(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, dst, 592cabdff1aSopenharmony_ci dst_stride, stride2, stride3, stride4); 593cabdff1aSopenharmony_ci} 594cabdff1aSopenharmony_ci 595cabdff1aSopenharmony_civoid ff_tm_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride, 596cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top_ptr) 597cabdff1aSopenharmony_ci{ 598cabdff1aSopenharmony_ci uint8_t top_left = src_top_ptr[-1]; 599cabdff1aSopenharmony_ci uint32_t loop_cnt; 600cabdff1aSopenharmony_ci __m128i tmp0, tmp1, tmp2, tmp3, reg0, reg1, reg2; 601cabdff1aSopenharmony_ci __m128i src0, src1, src2, src3, src4, src5, src6, src7; 602cabdff1aSopenharmony_ci __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 603cabdff1aSopenharmony_ci 604cabdff1aSopenharmony_ci reg0 = __lsx_vreplgr2vr_h(top_left); 605cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vld, src_top_ptr, 0, src_top_ptr, 16, reg1, reg2); 606cabdff1aSopenharmony_ci 607cabdff1aSopenharmony_ci src_left += 28; 608cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 609cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2, 610cabdff1aSopenharmony_ci src_left, 3, tmp3, tmp2, tmp1, tmp0); 611cabdff1aSopenharmony_ci src_left -= 4; 612cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwev_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1, 613cabdff1aSopenharmony_ci tmp3, reg1, src0, src1, src2, src3); 614cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwod_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1, 615cabdff1aSopenharmony_ci tmp3, reg1, src4, src5, src6, src7); 616cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, 617cabdff1aSopenharmony_ci reg0, src0, src1, src2, src3); 618cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, 619cabdff1aSopenharmony_ci reg0, src4, src5, src6, src7); 620cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwev_h_bu, tmp0, reg2, tmp1, reg2, tmp2, reg2, 621cabdff1aSopenharmony_ci tmp3, reg2, dst0, dst1, dst2, dst3); 622cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vaddwod_h_bu, tmp0, reg2, tmp1, reg2, tmp2, reg2, 623cabdff1aSopenharmony_ci tmp3, reg2, dst4, dst5, dst6, dst7); 624cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, dst0, reg0, dst1, reg0, dst2, reg0, dst3, 625cabdff1aSopenharmony_ci reg0, dst0, dst1, dst2, dst3); 626cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vssub_hu, dst4, reg0, dst5, reg0, dst6, reg0, dst7, 627cabdff1aSopenharmony_ci reg0, dst4, dst5, dst6, dst7); 628cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7, 629cabdff1aSopenharmony_ci src0, src1, src2, src3); 630cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7, 631cabdff1aSopenharmony_ci src4, src5, src6, src7); 632cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, dst0, 7, dst1, 7, dst2, 7, dst3, 7, 633cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 634cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vsat_hu, dst4, 7, dst5, 7, dst6, 7, dst7, 7, 635cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 636cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, 637cabdff1aSopenharmony_ci src3, src0, src1, src2, src3); 638cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vpackev_b, dst4, dst0, dst5, dst1, dst6, dst2, dst7, 639cabdff1aSopenharmony_ci dst3, dst0, dst1, dst2, dst3); 640cabdff1aSopenharmony_ci __lsx_vst(src0, dst, 0); 641cabdff1aSopenharmony_ci __lsx_vst(dst0, dst, 16); 642cabdff1aSopenharmony_ci dst += dst_stride; 643cabdff1aSopenharmony_ci __lsx_vst(src1, dst, 0); 644cabdff1aSopenharmony_ci __lsx_vst(dst1, dst, 16); 645cabdff1aSopenharmony_ci dst += dst_stride; 646cabdff1aSopenharmony_ci __lsx_vst(src2, dst, 0); 647cabdff1aSopenharmony_ci __lsx_vst(dst2, dst, 16); 648cabdff1aSopenharmony_ci dst += dst_stride; 649cabdff1aSopenharmony_ci __lsx_vst(src3, dst, 0); 650cabdff1aSopenharmony_ci __lsx_vst(dst3, dst, 16); 651cabdff1aSopenharmony_ci dst += dst_stride; 652cabdff1aSopenharmony_ci } 653cabdff1aSopenharmony_ci} 654