1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited 3cabdff1aSopenharmony_ci * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 4cabdff1aSopenharmony_ci * 5cabdff1aSopenharmony_ci * This file is part of FFmpeg. 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15cabdff1aSopenharmony_ci * Lesser General Public License for more details. 
16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20cabdff1aSopenharmony_ci */ 21cabdff1aSopenharmony_ci 22cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 23cabdff1aSopenharmony_ci#include "hpeldsp_lasx.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_cistatic av_always_inline void 26cabdff1aSopenharmony_ciput_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, 27cabdff1aSopenharmony_ci int dst_stride, int src_stride1, int src_stride2, int h) 28cabdff1aSopenharmony_ci{ 29cabdff1aSopenharmony_ci int stride1_2, stride1_3, stride1_4; 30cabdff1aSopenharmony_ci int stride2_2, stride2_3, stride2_4; 31cabdff1aSopenharmony_ci __asm__ volatile ( 32cabdff1aSopenharmony_ci "slli.d %[stride1_2], %[srcStride1], 1 \n\t" 33cabdff1aSopenharmony_ci "slli.d %[stride2_2], %[srcStride2], 1 \n\t" 34cabdff1aSopenharmony_ci "add.d %[stride1_3], %[stride1_2], %[srcStride1] \n\t" 35cabdff1aSopenharmony_ci "add.d %[stride2_3], %[stride2_2], %[srcStride2] \n\t" 36cabdff1aSopenharmony_ci "slli.d %[stride1_4], %[stride1_2], 1 \n\t" 37cabdff1aSopenharmony_ci "slli.d %[stride2_4], %[stride2_2], 1 \n\t" 38cabdff1aSopenharmony_ci "1: \n\t" 39cabdff1aSopenharmony_ci "vld $vr0, %[src1], 0 \n\t" 40cabdff1aSopenharmony_ci "vldx $vr1, %[src1], %[srcStride1] \n\t" 41cabdff1aSopenharmony_ci "vldx $vr2, %[src1], %[stride1_2] \n\t" 42cabdff1aSopenharmony_ci "vldx $vr3, %[src1], %[stride1_3] \n\t" 43cabdff1aSopenharmony_ci "add.d %[src1], %[src1], %[stride1_4] \n\t" 44cabdff1aSopenharmony_ci 45cabdff1aSopenharmony_ci "vld $vr4, %[src2], 0 \n\t" 46cabdff1aSopenharmony_ci "vldx $vr5, %[src2], %[srcStride2] \n\t" 47cabdff1aSopenharmony_ci "vldx $vr6, %[src2], %[stride2_2] \n\t" 
48cabdff1aSopenharmony_ci "vldx $vr7, %[src2], %[stride2_3] \n\t" 49cabdff1aSopenharmony_ci "add.d %[src2], %[src2], %[stride2_4] \n\t" 50cabdff1aSopenharmony_ci 51cabdff1aSopenharmony_ci "addi.d %[h], %[h], -4 \n\t" 52cabdff1aSopenharmony_ci 53cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr4, $vr0 \n\t" 54cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr5, $vr1 \n\t" 55cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr6, $vr2 \n\t" 56cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr7, $vr3 \n\t" 57cabdff1aSopenharmony_ci "vstelm.d $vr0, %[dst], 0, 0 \n\t" 58cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 59cabdff1aSopenharmony_ci "vstelm.d $vr1, %[dst], 0, 0 \n\t" 60cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 61cabdff1aSopenharmony_ci "vstelm.d $vr2, %[dst], 0, 0 \n\t" 62cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 63cabdff1aSopenharmony_ci "vstelm.d $vr3, %[dst], 0, 0 \n\t" 64cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 65cabdff1aSopenharmony_ci "bnez %[h], 1b \n\t" 66cabdff1aSopenharmony_ci 67cabdff1aSopenharmony_ci : [dst]"+&r"(dst), [src2]"+&r"(src2), [src1]"+&r"(src1), 68cabdff1aSopenharmony_ci [h]"+&r"(h), [stride1_2]"=&r"(stride1_2), 69cabdff1aSopenharmony_ci [stride1_3]"=&r"(stride1_3), [stride1_4]"=&r"(stride1_4), 70cabdff1aSopenharmony_ci [stride2_2]"=&r"(stride2_2), [stride2_3]"=&r"(stride2_3), 71cabdff1aSopenharmony_ci [stride2_4]"=&r"(stride2_4) 72cabdff1aSopenharmony_ci : [dstStride]"r"(dst_stride), [srcStride1]"r"(src_stride1), 73cabdff1aSopenharmony_ci [srcStride2]"r"(src_stride2) 74cabdff1aSopenharmony_ci : "memory" 75cabdff1aSopenharmony_ci ); 76cabdff1aSopenharmony_ci} 77cabdff1aSopenharmony_ci 78cabdff1aSopenharmony_cistatic av_always_inline void 79cabdff1aSopenharmony_ciput_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, 80cabdff1aSopenharmony_ci int dst_stride, int src_stride1, int src_stride2, int h) 81cabdff1aSopenharmony_ci{ 82cabdff1aSopenharmony_ci int 
stride1_2, stride1_3, stride1_4; 83cabdff1aSopenharmony_ci int stride2_2, stride2_3, stride2_4; 84cabdff1aSopenharmony_ci int dststride2, dststride3, dststride4; 85cabdff1aSopenharmony_ci __asm__ volatile ( 86cabdff1aSopenharmony_ci "slli.d %[stride1_2], %[srcStride1], 1 \n\t" 87cabdff1aSopenharmony_ci "slli.d %[stride2_2], %[srcStride2], 1 \n\t" 88cabdff1aSopenharmony_ci "slli.d %[dststride2], %[dstStride], 1 \n\t" 89cabdff1aSopenharmony_ci "add.d %[stride1_3], %[stride1_2], %[srcStride1] \n\t" 90cabdff1aSopenharmony_ci "add.d %[stride2_3], %[stride2_2], %[srcStride2] \n\t" 91cabdff1aSopenharmony_ci "add.d %[dststride3], %[dststride2], %[dstStride] \n\t" 92cabdff1aSopenharmony_ci "slli.d %[stride1_4], %[stride1_2], 1 \n\t" 93cabdff1aSopenharmony_ci "slli.d %[stride2_4], %[stride2_2], 1 \n\t" 94cabdff1aSopenharmony_ci "slli.d %[dststride4], %[dststride2], 1 \n\t" 95cabdff1aSopenharmony_ci "1: \n\t" 96cabdff1aSopenharmony_ci "vld $vr0, %[src1], 0 \n\t" 97cabdff1aSopenharmony_ci "vldx $vr1, %[src1], %[srcStride1] \n\t" 98cabdff1aSopenharmony_ci "vldx $vr2, %[src1], %[stride1_2] \n\t" 99cabdff1aSopenharmony_ci "vldx $vr3, %[src1], %[stride1_3] \n\t" 100cabdff1aSopenharmony_ci "add.d %[src1], %[src1], %[stride1_4] \n\t" 101cabdff1aSopenharmony_ci 102cabdff1aSopenharmony_ci "vld $vr4, %[src2], 0 \n\t" 103cabdff1aSopenharmony_ci "vldx $vr5, %[src2], %[srcStride2] \n\t" 104cabdff1aSopenharmony_ci "vldx $vr6, %[src2], %[stride2_2] \n\t" 105cabdff1aSopenharmony_ci "vldx $vr7, %[src2], %[stride2_3] \n\t" 106cabdff1aSopenharmony_ci "add.d %[src2], %[src2], %[stride2_4] \n\t" 107cabdff1aSopenharmony_ci 108cabdff1aSopenharmony_ci "addi.d %[h], %[h], -4 \n\t" 109cabdff1aSopenharmony_ci 110cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr4, $vr0 \n\t" 111cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr5, $vr1 \n\t" 112cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr6, $vr2 \n\t" 113cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr7, $vr3 \n\t" 114cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0 \n\t" 
115cabdff1aSopenharmony_ci "vstx $vr1, %[dst], %[dstStride] \n\t" 116cabdff1aSopenharmony_ci "vstx $vr2, %[dst], %[dststride2] \n\t" 117cabdff1aSopenharmony_ci "vstx $vr3, %[dst], %[dststride3] \n\t" 118cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dststride4] \n\t" 119cabdff1aSopenharmony_ci "bnez %[h], 1b \n\t" 120cabdff1aSopenharmony_ci 121cabdff1aSopenharmony_ci : [dst]"+&r"(dst), [src2]"+&r"(src2), [src1]"+&r"(src1), 122cabdff1aSopenharmony_ci [h]"+&r"(h), [stride1_2]"=&r"(stride1_2), 123cabdff1aSopenharmony_ci [stride1_3]"=&r"(stride1_3), [stride1_4]"=&r"(stride1_4), 124cabdff1aSopenharmony_ci [stride2_2]"=&r"(stride2_2), [stride2_3]"=&r"(stride2_3), 125cabdff1aSopenharmony_ci [stride2_4]"=&r"(stride2_4), [dststride2]"=&r"(dststride2), 126cabdff1aSopenharmony_ci [dststride3]"=&r"(dststride3), [dststride4]"=&r"(dststride4) 127cabdff1aSopenharmony_ci : [dstStride]"r"(dst_stride), [srcStride1]"r"(src_stride1), 128cabdff1aSopenharmony_ci [srcStride2]"r"(src_stride2) 129cabdff1aSopenharmony_ci : "memory" 130cabdff1aSopenharmony_ci ); 131cabdff1aSopenharmony_ci} 132cabdff1aSopenharmony_ci 133cabdff1aSopenharmony_civoid ff_put_pixels8_8_lasx(uint8_t *block, const uint8_t *pixels, 134cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 135cabdff1aSopenharmony_ci{ 136cabdff1aSopenharmony_ci uint64_t tmp[8]; 137cabdff1aSopenharmony_ci int h_8 = h >> 3; 138cabdff1aSopenharmony_ci int res = h & 7; 139cabdff1aSopenharmony_ci ptrdiff_t stride2, stride3, stride4; 140cabdff1aSopenharmony_ci 141cabdff1aSopenharmony_ci __asm__ volatile ( 142cabdff1aSopenharmony_ci "beqz %[h_8], 2f \n\t" 143cabdff1aSopenharmony_ci "slli.d %[stride2], %[stride], 1 \n\t" 144cabdff1aSopenharmony_ci "add.d %[stride3], %[stride2], %[stride] \n\t" 145cabdff1aSopenharmony_ci "slli.d %[stride4], %[stride2], 1 \n\t" 146cabdff1aSopenharmony_ci "1: \n\t" 147cabdff1aSopenharmony_ci "ld.d %[tmp0], %[src], 0x0 \n\t" 148cabdff1aSopenharmony_ci "ldx.d %[tmp1], %[src], %[stride] \n\t" 
149cabdff1aSopenharmony_ci "ldx.d %[tmp2], %[src], %[stride2] \n\t" 150cabdff1aSopenharmony_ci "ldx.d %[tmp3], %[src], %[stride3] \n\t" 151cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride4] \n\t" 152cabdff1aSopenharmony_ci "ld.d %[tmp4], %[src], 0x0 \n\t" 153cabdff1aSopenharmony_ci "ldx.d %[tmp5], %[src], %[stride] \n\t" 154cabdff1aSopenharmony_ci "ldx.d %[tmp6], %[src], %[stride2] \n\t" 155cabdff1aSopenharmony_ci "ldx.d %[tmp7], %[src], %[stride3] \n\t" 156cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride4] \n\t" 157cabdff1aSopenharmony_ci 158cabdff1aSopenharmony_ci "addi.d %[h_8], %[h_8], -1 \n\t" 159cabdff1aSopenharmony_ci 160cabdff1aSopenharmony_ci "st.d %[tmp0], %[dst], 0x0 \n\t" 161cabdff1aSopenharmony_ci "stx.d %[tmp1], %[dst], %[stride] \n\t" 162cabdff1aSopenharmony_ci "stx.d %[tmp2], %[dst], %[stride2] \n\t" 163cabdff1aSopenharmony_ci "stx.d %[tmp3], %[dst], %[stride3] \n\t" 164cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride4] \n\t" 165cabdff1aSopenharmony_ci "st.d %[tmp4], %[dst], 0x0 \n\t" 166cabdff1aSopenharmony_ci "stx.d %[tmp5], %[dst], %[stride] \n\t" 167cabdff1aSopenharmony_ci "stx.d %[tmp6], %[dst], %[stride2] \n\t" 168cabdff1aSopenharmony_ci "stx.d %[tmp7], %[dst], %[stride3] \n\t" 169cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride4] \n\t" 170cabdff1aSopenharmony_ci "bnez %[h_8], 1b \n\t" 171cabdff1aSopenharmony_ci 172cabdff1aSopenharmony_ci "2: \n\t" 173cabdff1aSopenharmony_ci "beqz %[res], 4f \n\t" 174cabdff1aSopenharmony_ci "3: \n\t" 175cabdff1aSopenharmony_ci "ld.d %[tmp0], %[src], 0x0 \n\t" 176cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride] \n\t" 177cabdff1aSopenharmony_ci "addi.d %[res], %[res], -1 \n\t" 178cabdff1aSopenharmony_ci "st.d %[tmp0], %[dst], 0x0 \n\t" 179cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride] \n\t" 180cabdff1aSopenharmony_ci "bnez %[res], 3b \n\t" 181cabdff1aSopenharmony_ci "4: \n\t" 182cabdff1aSopenharmony_ci : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), 
183cabdff1aSopenharmony_ci [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]), 184cabdff1aSopenharmony_ci [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]), 185cabdff1aSopenharmony_ci [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]), 186cabdff1aSopenharmony_ci [dst]"+&r"(block), [src]"+&r"(pixels), 187cabdff1aSopenharmony_ci [h_8]"+&r"(h_8), [res]"+&r"(res), 188cabdff1aSopenharmony_ci [stride2]"=&r"(stride2), [stride3]"=&r"(stride3), 189cabdff1aSopenharmony_ci [stride4]"=&r"(stride4) 190cabdff1aSopenharmony_ci : [stride]"r"(line_size) 191cabdff1aSopenharmony_ci : "memory" 192cabdff1aSopenharmony_ci ); 193cabdff1aSopenharmony_ci} 194cabdff1aSopenharmony_ci 195cabdff1aSopenharmony_civoid ff_put_pixels16_8_lsx(uint8_t *block, const uint8_t *pixels, 196cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 197cabdff1aSopenharmony_ci{ 198cabdff1aSopenharmony_ci int h_8 = h >> 3; 199cabdff1aSopenharmony_ci int res = h & 7; 200cabdff1aSopenharmony_ci ptrdiff_t stride2, stride3, stride4; 201cabdff1aSopenharmony_ci 202cabdff1aSopenharmony_ci __asm__ volatile ( 203cabdff1aSopenharmony_ci "beqz %[h_8], 2f \n\t" 204cabdff1aSopenharmony_ci "slli.d %[stride2], %[stride], 1 \n\t" 205cabdff1aSopenharmony_ci "add.d %[stride3], %[stride2], %[stride] \n\t" 206cabdff1aSopenharmony_ci "slli.d %[stride4], %[stride2], 1 \n\t" 207cabdff1aSopenharmony_ci "1: \n\t" 208cabdff1aSopenharmony_ci "vld $vr0, %[src], 0x0 \n\t" 209cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[stride] \n\t" 210cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride2] \n\t" 211cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride3] \n\t" 212cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride4] \n\t" 213cabdff1aSopenharmony_ci "vld $vr4, %[src], 0x0 \n\t" 214cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[stride] \n\t" 215cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride2] \n\t" 216cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride3] \n\t" 217cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride4] \n\t" 218cabdff1aSopenharmony_ci 
219cabdff1aSopenharmony_ci "addi.d %[h_8], %[h_8], -1 \n\t" 220cabdff1aSopenharmony_ci 221cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0x0 \n\t" 222cabdff1aSopenharmony_ci "vstx $vr1, %[dst], %[stride] \n\t" 223cabdff1aSopenharmony_ci "vstx $vr2, %[dst], %[stride2] \n\t" 224cabdff1aSopenharmony_ci "vstx $vr3, %[dst], %[stride3] \n\t" 225cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride4] \n\t" 226cabdff1aSopenharmony_ci "vst $vr4, %[dst], 0x0 \n\t" 227cabdff1aSopenharmony_ci "vstx $vr5, %[dst], %[stride] \n\t" 228cabdff1aSopenharmony_ci "vstx $vr6, %[dst], %[stride2] \n\t" 229cabdff1aSopenharmony_ci "vstx $vr7, %[dst], %[stride3] \n\t" 230cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride4] \n\t" 231cabdff1aSopenharmony_ci "bnez %[h_8], 1b \n\t" 232cabdff1aSopenharmony_ci 233cabdff1aSopenharmony_ci "2: \n\t" 234cabdff1aSopenharmony_ci "beqz %[res], 4f \n\t" 235cabdff1aSopenharmony_ci "3: \n\t" 236cabdff1aSopenharmony_ci "vld $vr0, %[src], 0x0 \n\t" 237cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride] \n\t" 238cabdff1aSopenharmony_ci "addi.d %[res], %[res], -1 \n\t" 239cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0x0 \n\t" 240cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride] \n\t" 241cabdff1aSopenharmony_ci "bnez %[res], 3b \n\t" 242cabdff1aSopenharmony_ci "4: \n\t" 243cabdff1aSopenharmony_ci : [dst]"+&r"(block), [src]"+&r"(pixels), 244cabdff1aSopenharmony_ci [h_8]"+&r"(h_8), [res]"+&r"(res), 245cabdff1aSopenharmony_ci [stride2]"=&r"(stride2), [stride3]"=&r"(stride3), 246cabdff1aSopenharmony_ci [stride4]"=&r"(stride4) 247cabdff1aSopenharmony_ci : [stride]"r"(line_size) 248cabdff1aSopenharmony_ci : "memory" 249cabdff1aSopenharmony_ci ); 250cabdff1aSopenharmony_ci} 251cabdff1aSopenharmony_ci 252cabdff1aSopenharmony_civoid ff_put_pixels8_x2_8_lasx(uint8_t *block, const uint8_t *pixels, 253cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 254cabdff1aSopenharmony_ci{ 255cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(block, pixels, pixels + 1, 
line_size, line_size, 256cabdff1aSopenharmony_ci line_size, h); 257cabdff1aSopenharmony_ci} 258cabdff1aSopenharmony_ci 259cabdff1aSopenharmony_civoid ff_put_pixels8_y2_8_lasx(uint8_t *block, const uint8_t *pixels, 260cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 261cabdff1aSopenharmony_ci{ 262cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(block, pixels, pixels + line_size, line_size, 263cabdff1aSopenharmony_ci line_size, line_size, h); 264cabdff1aSopenharmony_ci} 265cabdff1aSopenharmony_ci 266cabdff1aSopenharmony_civoid ff_put_pixels16_x2_8_lasx(uint8_t *block, const uint8_t *pixels, 267cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 268cabdff1aSopenharmony_ci{ 269cabdff1aSopenharmony_ci put_pixels16_l2_8_lsx(block, pixels, pixels + 1, line_size, line_size, 270cabdff1aSopenharmony_ci line_size, h); 271cabdff1aSopenharmony_ci} 272cabdff1aSopenharmony_ci 273cabdff1aSopenharmony_civoid ff_put_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels, 274cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 275cabdff1aSopenharmony_ci{ 276cabdff1aSopenharmony_ci put_pixels16_l2_8_lsx(block, pixels, pixels + line_size, line_size, 277cabdff1aSopenharmony_ci line_size, line_size, h); 278cabdff1aSopenharmony_ci} 279cabdff1aSopenharmony_ci 280cabdff1aSopenharmony_cistatic void common_hz_bil_no_rnd_16x16_lasx(const uint8_t *src, 281cabdff1aSopenharmony_ci int32_t src_stride, 282cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 283cabdff1aSopenharmony_ci{ 284cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7; 285cabdff1aSopenharmony_ci int32_t src_stride_2x = src_stride << 1; 286cabdff1aSopenharmony_ci int32_t src_stride_4x = src_stride << 2; 287cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 288cabdff1aSopenharmony_ci uint8_t *_src = (uint8_t*)src; 289cabdff1aSopenharmony_ci 290cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 291cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, 
src_stride_2x, src1, src2); 292cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 293cabdff1aSopenharmony_ci _src += 1; 294cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 295cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 296cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 297cabdff1aSopenharmony_ci _src += (src_stride_4x -1); 298cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, 299cabdff1aSopenharmony_ci src4, 0x20, src7, src6, 0x20, src0, src1, src2, src3); 300cabdff1aSopenharmony_ci src0 = __lasx_xvavg_bu(src0, src2); 301cabdff1aSopenharmony_ci src1 = __lasx_xvavg_bu(src1, src3); 302cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 303cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 1); 304cabdff1aSopenharmony_ci dst += dst_stride; 305cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 2); 306cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 3); 307cabdff1aSopenharmony_ci dst += dst_stride; 308cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 0); 309cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 8, 1); 310cabdff1aSopenharmony_ci dst += dst_stride; 311cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 2); 312cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 8, 3); 313cabdff1aSopenharmony_ci dst += dst_stride; 314cabdff1aSopenharmony_ci 315cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 316cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 317cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 318cabdff1aSopenharmony_ci _src += 1; 319cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 320cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 321cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 322cabdff1aSopenharmony_ci _src += (src_stride_4x - 1); 
323cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 324cabdff1aSopenharmony_ci 0x20, src7, src6, 0x20, src0, src1, src2, src3); 325cabdff1aSopenharmony_ci src0 = __lasx_xvavg_bu(src0, src2); 326cabdff1aSopenharmony_ci src1 = __lasx_xvavg_bu(src1, src3); 327cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 328cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 1); 329cabdff1aSopenharmony_ci dst += dst_stride; 330cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 2); 331cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 3); 332cabdff1aSopenharmony_ci dst += dst_stride; 333cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 0); 334cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 8, 1); 335cabdff1aSopenharmony_ci dst += dst_stride; 336cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 2); 337cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 8, 3); 338cabdff1aSopenharmony_ci dst += dst_stride; 339cabdff1aSopenharmony_ci 340cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 341cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 342cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 343cabdff1aSopenharmony_ci _src += 1; 344cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 345cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 346cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 347cabdff1aSopenharmony_ci _src += (src_stride_4x - 1); 348cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 349cabdff1aSopenharmony_ci 0x20, src7, src6, 0x20, src0, src1, src2, src3); 350cabdff1aSopenharmony_ci src0 = __lasx_xvavg_bu(src0, src2); 351cabdff1aSopenharmony_ci src1 = __lasx_xvavg_bu(src1, src3); 352cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 353cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 1); 
354cabdff1aSopenharmony_ci dst += dst_stride; 355cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 2); 356cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 3); 357cabdff1aSopenharmony_ci dst += dst_stride; 358cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 0); 359cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 8, 1); 360cabdff1aSopenharmony_ci dst += dst_stride; 361cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 2); 362cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 8, 3); 363cabdff1aSopenharmony_ci dst += dst_stride; 364cabdff1aSopenharmony_ci 365cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 366cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 367cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 368cabdff1aSopenharmony_ci _src += 1; 369cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 370cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 371cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 372cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 373cabdff1aSopenharmony_ci 0x20, src7, src6, 0x20, src0, src1, src2, src3); 374cabdff1aSopenharmony_ci src0 = __lasx_xvavg_bu(src0, src2); 375cabdff1aSopenharmony_ci src1 = __lasx_xvavg_bu(src1, src3); 376cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 377cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 1); 378cabdff1aSopenharmony_ci dst += dst_stride; 379cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 2); 380cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 3); 381cabdff1aSopenharmony_ci dst += dst_stride; 382cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 0); 383cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 8, 1); 384cabdff1aSopenharmony_ci dst += dst_stride; 385cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 2); 386cabdff1aSopenharmony_ci 
__lasx_xvstelm_d(src1, dst, 8, 3); 387cabdff1aSopenharmony_ci} 388cabdff1aSopenharmony_ci 389cabdff1aSopenharmony_cistatic void common_hz_bil_no_rnd_8x16_lasx(const uint8_t *src, 390cabdff1aSopenharmony_ci int32_t src_stride, 391cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 392cabdff1aSopenharmony_ci{ 393cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7; 394cabdff1aSopenharmony_ci int32_t src_stride_2x = src_stride << 1; 395cabdff1aSopenharmony_ci int32_t src_stride_4x = src_stride << 2; 396cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 397cabdff1aSopenharmony_ci uint8_t* _src = (uint8_t*)src; 398cabdff1aSopenharmony_ci 399cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 400cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 401cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 402cabdff1aSopenharmony_ci _src += 1; 403cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 404cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 405cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 406cabdff1aSopenharmony_ci _src += (src_stride_4x - 1); 407cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 408cabdff1aSopenharmony_ci 0x20, src7, src6, 0x20, src0, src1, src2, src3); 409cabdff1aSopenharmony_ci src0 = __lasx_xvavg_bu(src0, src2); 410cabdff1aSopenharmony_ci src1 = __lasx_xvavg_bu(src1, src3); 411cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 412cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 1); 413cabdff1aSopenharmony_ci dst += dst_stride; 414cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 2); 415cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 3); 416cabdff1aSopenharmony_ci dst += dst_stride; 417cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 0); 418cabdff1aSopenharmony_ci 
__lasx_xvstelm_d(src1, dst, 8, 1); 419cabdff1aSopenharmony_ci dst += dst_stride; 420cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 2); 421cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 8, 3); 422cabdff1aSopenharmony_ci dst += dst_stride; 423cabdff1aSopenharmony_ci 424cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 425cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 426cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 427cabdff1aSopenharmony_ci _src += 1; 428cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 429cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 430cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 431cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 432cabdff1aSopenharmony_ci 0x20, src7, src6, 0x20, src0, src1, src2, src3); 433cabdff1aSopenharmony_ci src0 = __lasx_xvavg_bu(src0, src2); 434cabdff1aSopenharmony_ci src1 = __lasx_xvavg_bu(src1, src3); 435cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 436cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 1); 437cabdff1aSopenharmony_ci dst += dst_stride; 438cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 2); 439cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 3); 440cabdff1aSopenharmony_ci dst += dst_stride; 441cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 0); 442cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 8, 1); 443cabdff1aSopenharmony_ci dst += dst_stride; 444cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 2); 445cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 8, 3); 446cabdff1aSopenharmony_ci} 447cabdff1aSopenharmony_ci 448cabdff1aSopenharmony_civoid ff_put_no_rnd_pixels16_x2_8_lasx(uint8_t *block, const uint8_t *pixels, 449cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 450cabdff1aSopenharmony_ci{ 451cabdff1aSopenharmony_ci if (h == 16) { 
452cabdff1aSopenharmony_ci common_hz_bil_no_rnd_16x16_lasx(pixels, line_size, block, line_size); 453cabdff1aSopenharmony_ci } else if (h == 8) { 454cabdff1aSopenharmony_ci common_hz_bil_no_rnd_8x16_lasx(pixels, line_size, block, line_size); 455cabdff1aSopenharmony_ci } 456cabdff1aSopenharmony_ci} 457cabdff1aSopenharmony_ci 458cabdff1aSopenharmony_cistatic void common_vt_bil_no_rnd_16x16_lasx(const uint8_t *src, 459cabdff1aSopenharmony_ci int32_t src_stride, 460cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 461cabdff1aSopenharmony_ci{ 462cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 463cabdff1aSopenharmony_ci __m256i src9, src10, src11, src12, src13, src14, src15, src16; 464cabdff1aSopenharmony_ci int32_t src_stride_2x = src_stride << 1; 465cabdff1aSopenharmony_ci int32_t src_stride_4x = src_stride << 2; 466cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 467cabdff1aSopenharmony_ci uint8_t* _src = (uint8_t*)src; 468cabdff1aSopenharmony_ci 469cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 470cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 471cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 472cabdff1aSopenharmony_ci _src += src_stride_4x; 473cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 474cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 475cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 476cabdff1aSopenharmony_ci _src += src_stride_4x; 477cabdff1aSopenharmony_ci src8 = __lasx_xvld(_src, 0); 478cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src9, src10); 479cabdff1aSopenharmony_ci src11 = __lasx_xvldx(_src, src_stride_3x); 480cabdff1aSopenharmony_ci _src += src_stride_4x; 481cabdff1aSopenharmony_ci src12 = __lasx_xvld(_src, 0); 482cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, 
src_stride_2x, 483cabdff1aSopenharmony_ci src13, src14); 484cabdff1aSopenharmony_ci src15 = __lasx_xvldx(_src, src_stride_3x); 485cabdff1aSopenharmony_ci _src += src_stride_4x; 486cabdff1aSopenharmony_ci src16 = __lasx_xvld(_src, 0); 487cabdff1aSopenharmony_ci 488cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 489cabdff1aSopenharmony_ci 0x20, src4, src3, 0x20, src0, src1, src2, src3); 490cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 491cabdff1aSopenharmony_ci 0x20, src8, src7, 0x20, src4, src5, src6, src7); 492cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src9, src8, 0x20, src10, src9, 0x20, src11, 493cabdff1aSopenharmony_ci src10, 0x20, src12, src11, 0x20, src8, src9, src10, src11); 494cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src13, src12, 0x20, src14, src13, 0x20, src15, 495cabdff1aSopenharmony_ci src14, 0x20, src16, src15, 0x20, src12, src13, src14, src15); 496cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvavg_bu, src0, src1, src2, src3, src4, src5, src6, src7, 497cabdff1aSopenharmony_ci src0, src2, src4, src6); 498cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvavg_bu, src8, src9, src10, src11, src12, src13, src14, 499cabdff1aSopenharmony_ci src15, src8, src10, src12, src14); 500cabdff1aSopenharmony_ci 501cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 502cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 1); 503cabdff1aSopenharmony_ci dst += dst_stride; 504cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 2); 505cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 3); 506cabdff1aSopenharmony_ci dst += dst_stride; 507cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, dst, 0, 0); 508cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, dst, 8, 1); 509cabdff1aSopenharmony_ci dst += dst_stride; 510cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, dst, 0, 2); 511cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, dst, 8, 3); 
512cabdff1aSopenharmony_ci dst += dst_stride; 513cabdff1aSopenharmony_ci __lasx_xvstelm_d(src4, dst, 0, 0); 514cabdff1aSopenharmony_ci __lasx_xvstelm_d(src4, dst, 8, 1); 515cabdff1aSopenharmony_ci dst += dst_stride; 516cabdff1aSopenharmony_ci __lasx_xvstelm_d(src4, dst, 0, 2); 517cabdff1aSopenharmony_ci __lasx_xvstelm_d(src4, dst, 8, 3); 518cabdff1aSopenharmony_ci dst += dst_stride; 519cabdff1aSopenharmony_ci __lasx_xvstelm_d(src6, dst, 0, 0); 520cabdff1aSopenharmony_ci __lasx_xvstelm_d(src6, dst, 8, 1); 521cabdff1aSopenharmony_ci dst += dst_stride; 522cabdff1aSopenharmony_ci __lasx_xvstelm_d(src6, dst, 0, 2); 523cabdff1aSopenharmony_ci __lasx_xvstelm_d(src6, dst, 8, 3); 524cabdff1aSopenharmony_ci dst += dst_stride; 525cabdff1aSopenharmony_ci __lasx_xvstelm_d(src8, dst, 0, 0); 526cabdff1aSopenharmony_ci __lasx_xvstelm_d(src8, dst, 8, 1); 527cabdff1aSopenharmony_ci dst += dst_stride; 528cabdff1aSopenharmony_ci __lasx_xvstelm_d(src8, dst, 0, 2); 529cabdff1aSopenharmony_ci __lasx_xvstelm_d(src8, dst, 8, 3); 530cabdff1aSopenharmony_ci dst += dst_stride; 531cabdff1aSopenharmony_ci __lasx_xvstelm_d(src10, dst, 0, 0); 532cabdff1aSopenharmony_ci __lasx_xvstelm_d(src10, dst, 8, 1); 533cabdff1aSopenharmony_ci dst += dst_stride; 534cabdff1aSopenharmony_ci __lasx_xvstelm_d(src10, dst, 0, 2); 535cabdff1aSopenharmony_ci __lasx_xvstelm_d(src10, dst, 8, 3); 536cabdff1aSopenharmony_ci dst += dst_stride; 537cabdff1aSopenharmony_ci __lasx_xvstelm_d(src12, dst, 0, 0); 538cabdff1aSopenharmony_ci __lasx_xvstelm_d(src12, dst, 8, 1); 539cabdff1aSopenharmony_ci dst += dst_stride; 540cabdff1aSopenharmony_ci __lasx_xvstelm_d(src12, dst, 0, 2); 541cabdff1aSopenharmony_ci __lasx_xvstelm_d(src12, dst, 8, 3); 542cabdff1aSopenharmony_ci dst += dst_stride; 543cabdff1aSopenharmony_ci __lasx_xvstelm_d(src14, dst, 0, 0); 544cabdff1aSopenharmony_ci __lasx_xvstelm_d(src14, dst, 8, 1); 545cabdff1aSopenharmony_ci dst += dst_stride; 546cabdff1aSopenharmony_ci __lasx_xvstelm_d(src14, dst, 0, 2); 
547cabdff1aSopenharmony_ci __lasx_xvstelm_d(src14, dst, 8, 3); 548cabdff1aSopenharmony_ci} 549cabdff1aSopenharmony_ci 550cabdff1aSopenharmony_cistatic void common_vt_bil_no_rnd_8x16_lasx(const uint8_t *src, 551cabdff1aSopenharmony_ci int32_t src_stride, 552cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 553cabdff1aSopenharmony_ci{ 554cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 555cabdff1aSopenharmony_ci int32_t src_stride_2x = src_stride << 1; 556cabdff1aSopenharmony_ci int32_t src_stride_4x = src_stride << 2; 557cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 558cabdff1aSopenharmony_ci uint8_t* _src = (uint8_t*)src; 559cabdff1aSopenharmony_ci 560cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 561cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 562cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 563cabdff1aSopenharmony_ci _src += src_stride_4x; 564cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 565cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 566cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 567cabdff1aSopenharmony_ci _src += src_stride_4x; 568cabdff1aSopenharmony_ci src8 = __lasx_xvld(_src, 0); 569cabdff1aSopenharmony_ci 570cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 571cabdff1aSopenharmony_ci 0x20, src4, src3, 0x20, src0, src1, src2, src3); 572cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 573cabdff1aSopenharmony_ci 0x20, src8, src7, 0x20, src4, src5, src6, src7); 574cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvavg_bu, src0, src1, src2, src3, src4, src5, src6, src7, 575cabdff1aSopenharmony_ci src0, src2, src4, src6); 576cabdff1aSopenharmony_ci 577cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 578cabdff1aSopenharmony_ci 
__lasx_xvstelm_d(src0, dst, 8, 1); 579cabdff1aSopenharmony_ci dst += dst_stride; 580cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 2); 581cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 8, 3); 582cabdff1aSopenharmony_ci dst += dst_stride; 583cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, dst, 0, 0); 584cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, dst, 8, 1); 585cabdff1aSopenharmony_ci dst += dst_stride; 586cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, dst, 0, 2); 587cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, dst, 8, 3); 588cabdff1aSopenharmony_ci dst += dst_stride; 589cabdff1aSopenharmony_ci __lasx_xvstelm_d(src4, dst, 0, 0); 590cabdff1aSopenharmony_ci __lasx_xvstelm_d(src4, dst, 8, 1); 591cabdff1aSopenharmony_ci dst += dst_stride; 592cabdff1aSopenharmony_ci __lasx_xvstelm_d(src4, dst, 0, 2); 593cabdff1aSopenharmony_ci __lasx_xvstelm_d(src4, dst, 8, 3); 594cabdff1aSopenharmony_ci dst += dst_stride; 595cabdff1aSopenharmony_ci __lasx_xvstelm_d(src6, dst, 0, 0); 596cabdff1aSopenharmony_ci __lasx_xvstelm_d(src6, dst, 8, 1); 597cabdff1aSopenharmony_ci dst += dst_stride; 598cabdff1aSopenharmony_ci __lasx_xvstelm_d(src6, dst, 0, 2); 599cabdff1aSopenharmony_ci __lasx_xvstelm_d(src6, dst, 8, 3); 600cabdff1aSopenharmony_ci} 601cabdff1aSopenharmony_ci 602cabdff1aSopenharmony_civoid ff_put_no_rnd_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels, 603cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 604cabdff1aSopenharmony_ci{ 605cabdff1aSopenharmony_ci if (h == 16) { 606cabdff1aSopenharmony_ci common_vt_bil_no_rnd_16x16_lasx(pixels, line_size, block, line_size); 607cabdff1aSopenharmony_ci } else if (h == 8) { 608cabdff1aSopenharmony_ci common_vt_bil_no_rnd_8x16_lasx(pixels, line_size, block, line_size); 609cabdff1aSopenharmony_ci } 610cabdff1aSopenharmony_ci} 611cabdff1aSopenharmony_ci 612cabdff1aSopenharmony_cistatic void common_hv_bil_no_rnd_16x16_lasx(const uint8_t *src, 613cabdff1aSopenharmony_ci int32_t src_stride, 614cabdff1aSopenharmony_ci 
uint8_t *dst, int32_t dst_stride) 615cabdff1aSopenharmony_ci{ 616cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; 617cabdff1aSopenharmony_ci __m256i src10, src11, src12, src13, src14, src15, src16, src17; 618cabdff1aSopenharmony_ci __m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 619cabdff1aSopenharmony_ci int32_t src_stride_2x = src_stride << 1; 620cabdff1aSopenharmony_ci int32_t src_stride_4x = src_stride << 2; 621cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 622cabdff1aSopenharmony_ci uint8_t* _src = (uint8_t*)src; 623cabdff1aSopenharmony_ci 624cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 625cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 626cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 627cabdff1aSopenharmony_ci _src += src_stride_4x; 628cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 629cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 630cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 631cabdff1aSopenharmony_ci _src += (1 - src_stride_4x); 632cabdff1aSopenharmony_ci src9 = __lasx_xvld(_src, 0); 633cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, 634cabdff1aSopenharmony_ci src10, src11); 635cabdff1aSopenharmony_ci src12 = __lasx_xvldx(_src, src_stride_3x); 636cabdff1aSopenharmony_ci _src += src_stride_4x; 637cabdff1aSopenharmony_ci src13 = __lasx_xvld(_src, 0); 638cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, 639cabdff1aSopenharmony_ci src14, src15); 640cabdff1aSopenharmony_ci src16 = __lasx_xvldx(_src, src_stride_3x); 641cabdff1aSopenharmony_ci _src += (src_stride_4x - 1); 642cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17); 643cabdff1aSopenharmony_ci 644cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, 
src1, src5, 0x02, src2, 645cabdff1aSopenharmony_ci src6, 0x02, src3, src7, 0x02, src0, src1, src2, src3); 646cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10, 647cabdff1aSopenharmony_ci src14, 0x02, src11, src15, 0x02, src4, src5, src6, src7); 648cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02, 649cabdff1aSopenharmony_ci src8, src9); 650cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, src3, 651cabdff1aSopenharmony_ci sum0, sum2, sum4, sum6); 652cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, src3, 653cabdff1aSopenharmony_ci sum1, sum3, sum5, sum7); 654cabdff1aSopenharmony_ci src8 = __lasx_xvilvl_h(src9, src4); 655cabdff1aSopenharmony_ci src9 = __lasx_xvilvh_h(src9, src4); 656cabdff1aSopenharmony_ci 657cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2, 658cabdff1aSopenharmony_ci sum3, sum3, src0, src1, src2, src3); 659cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6, 660cabdff1aSopenharmony_ci sum7, sum7, src4, src5, src6, src7); 661cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9); 662cabdff1aSopenharmony_ci 663cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5, 664cabdff1aSopenharmony_ci sum0, sum1, sum2, sum3); 665cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9, 666cabdff1aSopenharmony_ci sum4, sum5, sum6, sum7); 667cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1, 668cabdff1aSopenharmony_ci sum0, sum1, sum2, sum3); 669cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1, 670cabdff1aSopenharmony_ci sum4, sum5, sum6, sum7); 671cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, 
sum2, 2, sum5, sum4, 2, 672cabdff1aSopenharmony_ci sum7, sum6, 2, sum0, sum1, sum2, sum3); 673cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 0, 0); 674cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 8, 1); 675cabdff1aSopenharmony_ci dst += dst_stride; 676cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 0, 0); 677cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 8, 1); 678cabdff1aSopenharmony_ci dst += dst_stride; 679cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 0, 0); 680cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 8, 1); 681cabdff1aSopenharmony_ci dst += dst_stride; 682cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 0, 0); 683cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 8, 1); 684cabdff1aSopenharmony_ci dst += dst_stride; 685cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 0, 2); 686cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 8, 3); 687cabdff1aSopenharmony_ci dst += dst_stride; 688cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 0, 2); 689cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 8, 3); 690cabdff1aSopenharmony_ci dst += dst_stride; 691cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 0, 2); 692cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 8, 3); 693cabdff1aSopenharmony_ci dst += dst_stride; 694cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 0, 2); 695cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 8, 3); 696cabdff1aSopenharmony_ci dst += dst_stride; 697cabdff1aSopenharmony_ci 698cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 699cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 700cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 701cabdff1aSopenharmony_ci _src += src_stride_4x; 702cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 703cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 704cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 
705cabdff1aSopenharmony_ci _src += (1 - src_stride_4x); 706cabdff1aSopenharmony_ci src9 = __lasx_xvld(_src, 0); 707cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, 708cabdff1aSopenharmony_ci src10, src11); 709cabdff1aSopenharmony_ci src12 = __lasx_xvldx(_src, src_stride_3x); 710cabdff1aSopenharmony_ci _src += src_stride_4x; 711cabdff1aSopenharmony_ci src13 = __lasx_xvld(_src, 0); 712cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, 713cabdff1aSopenharmony_ci src14, src15); 714cabdff1aSopenharmony_ci src16 = __lasx_xvldx(_src, src_stride_3x); 715cabdff1aSopenharmony_ci _src += (src_stride_4x - 1); 716cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17); 717cabdff1aSopenharmony_ci 718cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2, src6, 0x02, 719cabdff1aSopenharmony_ci src3, src7, 0x02, src0, src1, src2, src3); 720cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10, src14, 0x02, 721cabdff1aSopenharmony_ci src11, src15, 0x02, src4, src5, src6, src7); 722cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02, src8, src9); 723cabdff1aSopenharmony_ci 724cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, src3, 725cabdff1aSopenharmony_ci sum0, sum2, sum4, sum6); 726cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, src3, 727cabdff1aSopenharmony_ci sum1, sum3, sum5, sum7); 728cabdff1aSopenharmony_ci src8 = __lasx_xvilvl_h(src9, src4); 729cabdff1aSopenharmony_ci src9 = __lasx_xvilvh_h(src9, src4); 730cabdff1aSopenharmony_ci 731cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2, 732cabdff1aSopenharmony_ci sum3, sum3, src0, src1, src2, src3); 733cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, 
sum6, 734cabdff1aSopenharmony_ci sum7, sum7, src4, src5, src6, src7); 735cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9); 736cabdff1aSopenharmony_ci 737cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5, 738cabdff1aSopenharmony_ci sum0, sum1, sum2, sum3); 739cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9, 740cabdff1aSopenharmony_ci sum4, sum5, sum6, sum7); 741cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1, 742cabdff1aSopenharmony_ci sum0, sum1, sum2, sum3); 743cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1, 744cabdff1aSopenharmony_ci sum4, sum5, sum6, sum7); 745cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, 2, 746cabdff1aSopenharmony_ci sum7, sum6, 2, sum0, sum1, sum2, sum3); 747cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 0, 0); 748cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 8, 1); 749cabdff1aSopenharmony_ci dst += dst_stride; 750cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 0, 0); 751cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 8, 1); 752cabdff1aSopenharmony_ci dst += dst_stride; 753cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 0, 0); 754cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 8, 1); 755cabdff1aSopenharmony_ci dst += dst_stride; 756cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 0, 0); 757cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 8, 1); 758cabdff1aSopenharmony_ci dst += dst_stride; 759cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 0, 2); 760cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 8, 3); 761cabdff1aSopenharmony_ci dst += dst_stride; 762cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 0, 2); 763cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 8, 3); 764cabdff1aSopenharmony_ci dst += dst_stride; 
765cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 0, 2); 766cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 8, 3); 767cabdff1aSopenharmony_ci dst += dst_stride; 768cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 0, 2); 769cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 8, 3); 770cabdff1aSopenharmony_ci} 771cabdff1aSopenharmony_ci 772cabdff1aSopenharmony_cistatic void common_hv_bil_no_rnd_8x16_lasx(const uint8_t *src, 773cabdff1aSopenharmony_ci int32_t src_stride, 774cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 775cabdff1aSopenharmony_ci{ 776cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; 777cabdff1aSopenharmony_ci __m256i src10, src11, src12, src13, src14, src15, src16, src17; 778cabdff1aSopenharmony_ci __m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 779cabdff1aSopenharmony_ci int32_t src_stride_2x = src_stride << 1; 780cabdff1aSopenharmony_ci int32_t src_stride_4x = src_stride << 2; 781cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 782cabdff1aSopenharmony_ci uint8_t* _src = (uint8_t*)src; 783cabdff1aSopenharmony_ci 784cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 785cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 786cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 787cabdff1aSopenharmony_ci _src += src_stride_4x; 788cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 789cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 790cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 791cabdff1aSopenharmony_ci _src += (1 - src_stride_4x); 792cabdff1aSopenharmony_ci src9 = __lasx_xvld(_src, 0); 793cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, 794cabdff1aSopenharmony_ci src10, src11); 795cabdff1aSopenharmony_ci src12 = __lasx_xvldx(_src, src_stride_3x); 796cabdff1aSopenharmony_ci _src += 
src_stride_4x; 797cabdff1aSopenharmony_ci src13 = __lasx_xvld(_src, 0); 798cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, 799cabdff1aSopenharmony_ci src14, src15); 800cabdff1aSopenharmony_ci src16 = __lasx_xvldx(_src, src_stride_3x); 801cabdff1aSopenharmony_ci _src += (src_stride_4x - 1); 802cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17); 803cabdff1aSopenharmony_ci 804cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2, 805cabdff1aSopenharmony_ci src6, 0x02, src3, src7, 0x02, src0, src1, src2, src3); 806cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10, 807cabdff1aSopenharmony_ci src14, 0x02, src11, src15, 0x02, src4, src5, src6, src7); 808cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02, src8, src9); 809cabdff1aSopenharmony_ci 810cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, src3, 811cabdff1aSopenharmony_ci sum0, sum2, sum4, sum6); 812cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, src3, 813cabdff1aSopenharmony_ci sum1, sum3, sum5, sum7); 814cabdff1aSopenharmony_ci src8 = __lasx_xvilvl_h(src9, src4); 815cabdff1aSopenharmony_ci src9 = __lasx_xvilvh_h(src9, src4); 816cabdff1aSopenharmony_ci 817cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2, 818cabdff1aSopenharmony_ci sum3, sum3, src0, src1, src2, src3); 819cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6, 820cabdff1aSopenharmony_ci sum7, sum7, src4, src5, src6, src7); 821cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9); 822cabdff1aSopenharmony_ci 823cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5, 824cabdff1aSopenharmony_ci sum0, sum1, sum2, sum3); 
825cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9, 826cabdff1aSopenharmony_ci sum4, sum5, sum6, sum7); 827cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1, 828cabdff1aSopenharmony_ci sum0, sum1, sum2, sum3); 829cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1, 830cabdff1aSopenharmony_ci sum4, sum5, sum6, sum7); 831cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, 2, 832cabdff1aSopenharmony_ci sum7, sum6, 2, sum0, sum1, sum2, sum3); 833cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 0, 0); 834cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 8, 1); 835cabdff1aSopenharmony_ci dst += dst_stride; 836cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 0, 0); 837cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 8, 1); 838cabdff1aSopenharmony_ci dst += dst_stride; 839cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 0, 0); 840cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 8, 1); 841cabdff1aSopenharmony_ci dst += dst_stride; 842cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 0, 0); 843cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 8, 1); 844cabdff1aSopenharmony_ci dst += dst_stride; 845cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 0, 2); 846cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum0, dst, 8, 3); 847cabdff1aSopenharmony_ci dst += dst_stride; 848cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 0, 2); 849cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum1, dst, 8, 3); 850cabdff1aSopenharmony_ci dst += dst_stride; 851cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 0, 2); 852cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum2, dst, 8, 3); 853cabdff1aSopenharmony_ci dst += dst_stride; 854cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 0, 2); 855cabdff1aSopenharmony_ci __lasx_xvstelm_d(sum3, dst, 8, 3); 856cabdff1aSopenharmony_ci} 857cabdff1aSopenharmony_ci 
858cabdff1aSopenharmony_civoid ff_put_no_rnd_pixels16_xy2_8_lasx(uint8_t *block, 859cabdff1aSopenharmony_ci const uint8_t *pixels, 860cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 861cabdff1aSopenharmony_ci{ 862cabdff1aSopenharmony_ci if (h == 16) { 863cabdff1aSopenharmony_ci common_hv_bil_no_rnd_16x16_lasx(pixels, line_size, block, line_size); 864cabdff1aSopenharmony_ci } else if (h == 8) { 865cabdff1aSopenharmony_ci common_hv_bil_no_rnd_8x16_lasx(pixels, line_size, block, line_size); 866cabdff1aSopenharmony_ci } 867cabdff1aSopenharmony_ci} 868cabdff1aSopenharmony_ci 869cabdff1aSopenharmony_cistatic void common_hz_bil_no_rnd_8x8_lasx(const uint8_t *src, 870cabdff1aSopenharmony_ci int32_t src_stride, 871cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 872cabdff1aSopenharmony_ci{ 873cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7; 874cabdff1aSopenharmony_ci __m256i src8, src9, src10, src11, src12, src13, src14, src15; 875cabdff1aSopenharmony_ci int32_t src_stride_2x = src_stride << 1; 876cabdff1aSopenharmony_ci int32_t src_stride_4x = src_stride << 2; 877cabdff1aSopenharmony_ci int32_t dst_stride_2x = dst_stride << 1; 878cabdff1aSopenharmony_ci int32_t dst_stride_4x = dst_stride << 2; 879cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 880cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 881cabdff1aSopenharmony_ci uint8_t* _src = (uint8_t*)src; 882cabdff1aSopenharmony_ci 883cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 884cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 885cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 886cabdff1aSopenharmony_ci _src += src_stride_4x; 887cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 888cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 889cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 
890cabdff1aSopenharmony_ci _src += (1 - src_stride_4x); 891cabdff1aSopenharmony_ci src8 = __lasx_xvld(_src, 0); 892cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src9, src10); 893cabdff1aSopenharmony_ci src11 = __lasx_xvldx(_src, src_stride_3x); 894cabdff1aSopenharmony_ci _src += src_stride_4x; 895cabdff1aSopenharmony_ci src12 = __lasx_xvld(_src, 0); 896cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, 897cabdff1aSopenharmony_ci src13, src14); 898cabdff1aSopenharmony_ci src15 = __lasx_xvldx(_src, src_stride_3x); 899cabdff1aSopenharmony_ci 900cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_d, src1, src0, src3, src2, src5, src4, src7, 901cabdff1aSopenharmony_ci src6, src0, src1, src2, src3); 902cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_d, src9, src8, src11, src10, src13, src12, src15, 903cabdff1aSopenharmony_ci src14, src4, src5, src6, src7); 904cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 905cabdff1aSopenharmony_ci 0x20, src7, src6, 0x20, src0, src1, src2, src3); 906cabdff1aSopenharmony_ci src0 = __lasx_xvavg_bu(src0, src2); 907cabdff1aSopenharmony_ci src1 = __lasx_xvavg_bu(src1, src3); 908cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 909cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride, 0, 1); 910cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride_2x, 0, 2); 911cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride_3x, 0, 3); 912cabdff1aSopenharmony_ci dst += dst_stride_4x; 913cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 0); 914cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst + dst_stride, 0, 1); 915cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst + dst_stride_2x, 0, 2); 916cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst + dst_stride_3x, 0, 3); 917cabdff1aSopenharmony_ci} 918cabdff1aSopenharmony_ci 919cabdff1aSopenharmony_cistatic void 
common_hz_bil_no_rnd_4x8_lasx(const uint8_t *src, 920cabdff1aSopenharmony_ci int32_t src_stride, 921cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 922cabdff1aSopenharmony_ci{ 923cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7; 924cabdff1aSopenharmony_ci int32_t src_stride_2x = src_stride << 1; 925cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 926cabdff1aSopenharmony_ci int32_t dst_stride_2x = dst_stride << 1; 927cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 928cabdff1aSopenharmony_ci uint8_t *_src = (uint8_t*)src; 929cabdff1aSopenharmony_ci 930cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 931cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 932cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 933cabdff1aSopenharmony_ci _src += 1; 934cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 935cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 936cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 937cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_d, src1, src0, src3, src2, src5, src4, src7, src6, 938cabdff1aSopenharmony_ci src0, src1, src2, src3); 939cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src1); 940cabdff1aSopenharmony_ci src0 = __lasx_xvavg_bu(src0, src1); 941cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 942cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride, 0, 1); 943cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride_2x, 0, 2); 944cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride_3x, 0, 3); 945cabdff1aSopenharmony_ci} 946cabdff1aSopenharmony_ci 947cabdff1aSopenharmony_civoid ff_put_no_rnd_pixels8_x2_8_lasx(uint8_t *block, const uint8_t *pixels, 948cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 949cabdff1aSopenharmony_ci{ 
950cabdff1aSopenharmony_ci if (h == 8) { 951cabdff1aSopenharmony_ci common_hz_bil_no_rnd_8x8_lasx(pixels, line_size, block, line_size); 952cabdff1aSopenharmony_ci } else if (h == 4) { 953cabdff1aSopenharmony_ci common_hz_bil_no_rnd_4x8_lasx(pixels, line_size, block, line_size); 954cabdff1aSopenharmony_ci } 955cabdff1aSopenharmony_ci} 956cabdff1aSopenharmony_ci 957cabdff1aSopenharmony_cistatic void common_vt_bil_no_rnd_8x8_lasx(const uint8_t *src, int32_t src_stride, 958cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 959cabdff1aSopenharmony_ci{ 960cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8; 961cabdff1aSopenharmony_ci int32_t src_stride_2x = src_stride << 1; 962cabdff1aSopenharmony_ci int32_t src_stride_4x = src_stride << 2; 963cabdff1aSopenharmony_ci int32_t dst_stride_2x = dst_stride << 1; 964cabdff1aSopenharmony_ci int32_t dst_stride_4x = dst_stride << 2; 965cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 966cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 967cabdff1aSopenharmony_ci uint8_t* _src = (uint8_t*)src; 968cabdff1aSopenharmony_ci 969cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 970cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); 971cabdff1aSopenharmony_ci src3 = __lasx_xvldx(_src, src_stride_3x); 972cabdff1aSopenharmony_ci _src += src_stride_4x; 973cabdff1aSopenharmony_ci src4 = __lasx_xvld(_src, 0); 974cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); 975cabdff1aSopenharmony_ci src7 = __lasx_xvldx(_src, src_stride_3x); 976cabdff1aSopenharmony_ci _src += src_stride_4x; 977cabdff1aSopenharmony_ci src8 = __lasx_xvld(_src, 0); 978cabdff1aSopenharmony_ci 979cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_d, src1, src0, src2, src1, src3, src2, src4, src3, 980cabdff1aSopenharmony_ci src0, src1, src2, src3); 981cabdff1aSopenharmony_ci 
DUP4_ARG2(__lasx_xvpickev_d, src5, src4, src6, src5, src7, src6, src8, src7, 982cabdff1aSopenharmony_ci src4, src5, src6, src7); 983cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src2, src0, 0x20, src3, src1, 0x20, src6, src4, 984cabdff1aSopenharmony_ci 0x20, src7, src5, 0x20, src0, src1, src2, src3); 985cabdff1aSopenharmony_ci src0 = __lasx_xvavg_bu(src0, src1); 986cabdff1aSopenharmony_ci src1 = __lasx_xvavg_bu(src2, src3); 987cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 988cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride, 0, 1); 989cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride_2x, 0, 2); 990cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride_3x, 0, 3); 991cabdff1aSopenharmony_ci dst += dst_stride_4x; 992cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst, 0, 0); 993cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst + dst_stride, 0, 1); 994cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst + dst_stride_2x, 0, 2); 995cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, dst + dst_stride_3x, 0, 3); 996cabdff1aSopenharmony_ci} 997cabdff1aSopenharmony_ci 998cabdff1aSopenharmony_cistatic void common_vt_bil_no_rnd_4x8_lasx(const uint8_t *src, int32_t src_stride, 999cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 1000cabdff1aSopenharmony_ci{ 1001cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, src4; 1002cabdff1aSopenharmony_ci int32_t src_stride_2x = src_stride << 1; 1003cabdff1aSopenharmony_ci int32_t src_stride_4x = src_stride << 2; 1004cabdff1aSopenharmony_ci int32_t dst_stride_2x = dst_stride << 1; 1005cabdff1aSopenharmony_ci int32_t dst_stride_3x = dst_stride_2x + dst_stride; 1006cabdff1aSopenharmony_ci int32_t src_stride_3x = src_stride_2x + src_stride; 1007cabdff1aSopenharmony_ci uint8_t* _src = (uint8_t*)src; 1008cabdff1aSopenharmony_ci 1009cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 1010cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, _src, 
1011cabdff1aSopenharmony_ci src_stride_3x, _src, src_stride_4x, src1, src2, src3, src4); 1012cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_d, src1, src0, src2, src1, src3, src2, src4, src3, 1013cabdff1aSopenharmony_ci src0, src1, src2, src3); 1014cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvpermi_q, src2, src0, 0x20, src3, src1, 0x20, src0, src1); 1015cabdff1aSopenharmony_ci src0 = __lasx_xvavg_bu(src0, src1); 1016cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst, 0, 0); 1017cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride, 0, 1); 1018cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride_2x, 0, 2); 1019cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, dst + dst_stride_3x, 0, 3); 1020cabdff1aSopenharmony_ci}

/**
 * Height dispatcher for the 8-wide "y2" no-rounding put.
 *
 * h == 8 forwards to common_vt_bil_no_rnd_8x8_lasx() and h == 4 forwards to
 * common_vt_bil_no_rnd_4x8_lasx(); line_size is passed as both the source
 * and the destination stride. Any other h matches neither branch, so block
 * is left untouched.
 *
 * @param block     destination pointer
 * @param pixels    source pointer
 * @param line_size stride used for both source and destination
 * @param h         number of rows requested; only 8 and 4 are handled
 */
void ff_put_no_rnd_pixels8_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_vt_bil_no_rnd_8x8_lasx(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_vt_bil_no_rnd_4x8_lasx(pixels, line_size, block, line_size);
    }
}

/**
 * Horizontal + vertical bilinear filter with the "+1" bias (no_rnd variant),
 * writing 8 rows of 8 bytes.
 *
 * Nine rows are loaded starting at src and nine more starting at src + 1.
 * Going by the LASX intrinsic semantics, each output byte combines the four
 * inputs at (x, y), (x + 1, y), (x, y + 1), (x + 1, y + 1) as
 * (sum + 1) >> 2; the 1 and the 2 are the xvaddi_hu / xvsrani_b_h immediates.
 *
 * NOTE(review): every xvld/xvldx presumably fetches a whole 256-bit vector
 * although only the first 9 bytes of a row are consumed -- confirm that the
 * callers guarantee readable padding.
 *
 * @param src        top-left source byte
 * @param src_stride distance in bytes between source rows
 * @param dst        first destination byte
 * @param dst_stride distance in bytes between destination rows
 */
static void common_hv_bil_no_rnd_8x8_lasx(const uint8_t *src, int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    __m256i src8, src9, src10, src11, src12, src13, src14, src15, src16, src17;
    __m256i sum0, sum1, sum2, sum3;
    int32_t src_stride_2x = src_stride << 1;
    int32_t src_stride_4x = src_stride << 2;
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_4x = dst_stride << 2;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src_stride_3x = src_stride_2x + src_stride;
    uint8_t* _src = (uint8_t*)src;

    /* Rows 0..7 at column offset 0 -> src0..src7. */
    src0 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2);
    src3 = __lasx_xvldx(_src, src_stride_3x);
    _src += src_stride_4x;
    src4 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6);
    src7 = __lasx_xvldx(_src, src_stride_3x);
    /* Rewind to row 0 and step one byte right: rows 0..7 at offset 1 -> src9..src16. */
    _src += (1 - src_stride_4x);
    src9 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
              src10, src11);
    src12 = __lasx_xvldx(_src, src_stride_3x);
    _src += src_stride_4x;
    src13 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
              src14, src15);
    src16 = __lasx_xvldx(_src, src_stride_3x);
    /* Row 8 at offsets 0 and 1 -> src8 / src17. */
    _src += (src_stride_4x - 1);
    DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17);

    /* Pair every row with its one-byte-shifted copy (byte interleave). */
    DUP4_ARG2(__lasx_xvilvl_b, src9, src0, src10, src1, src11, src2, src12, src3,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvilvl_b, src13, src4, src14, src5, src15, src6, src16, src7,
              src4, src5, src6, src7);
    src8 = __lasx_xvilvl_b(src17, src8);
    /*
     * Combine row n with row n + 1 in one register. The DUPn macros assign
     * their outputs left to right, so each call still sees the unmodified
     * "next" row as its first operand.
     */
    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2,
              0x20, src4, src3, 0x20, src0, src1, src2, src3);
    DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6,
              0x20, src8, src7, 0x20, src4, src5, src6, src7);
    /* Widening add of the interleaved byte pairs (horizontal part). */
    DUP4_ARG2(__lasx_xvhaddw_hu_bu, src0, src0, src1, src1, src2, src2,
              src3, src3, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvhaddw_hu_bu, src4, src4, src5, src5, src6, src6,
              src7, src7, src4, src5, src6, src7);
    /* Add neighbouring registers (vertical part), then bias by 1. */
    DUP4_ARG2(__lasx_xvadd_h, src0, src1, src2, src3, src4, src5, src6, src7,
              sum0, sum1, sum2, sum3);
    DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1,
              sum0, sum1, sum2, sum3);
    /* Shift right by 2 and narrow back to bytes, two registers at a time. */
    DUP2_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum0, sum1);
    /*
     * Rows are taken from 64-bit elements 0, 2, 1, 3 in that order --
     * presumably because the narrowing shift packs its two inputs per
     * 128-bit lane, which swaps the middle two elements.
     */
    __lasx_xvstelm_d(sum0, dst, 0, 0);
    __lasx_xvstelm_d(sum0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(sum0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(sum0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(sum1, dst, 0, 0);
    __lasx_xvstelm_d(sum1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(sum1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(sum1, dst + dst_stride_3x, 0, 3);
}

/**
 * Four-row counterpart of common_hv_bil_no_rnd_8x8_lasx(): loads five rows
 * at src and five at src + 1 and stores 4 rows of 8 bytes, using the same
 * "+1, >> 2" arithmetic.
 *
 * NOTE(review): as in the 8-row variant, the vector loads presumably read
 * more bytes per row than are consumed -- confirm caller padding.
 *
 * @param src        top-left source byte
 * @param src_stride distance in bytes between source rows
 * @param dst        first destination byte
 * @param dst_stride distance in bytes between destination rows
 */
static void common_hv_bil_no_rnd_4x8_lasx(const uint8_t *src, int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    __m256i src8, src9, sum0, sum1;
    int32_t src_stride_2x = src_stride << 1;
    int32_t src_stride_4x = src_stride << 2;
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src_stride_3x = src_stride_2x + src_stride;
    uint8_t *_src = (uint8_t*)src;

    /* Rows 0..3 at offset 0 -> src0..src3, at offset 1 -> src5..src8. */
    src0 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2);
    src3 = __lasx_xvldx(_src, src_stride_3x);
    _src += 1;
    src5 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src6, src7);
    src8 = __lasx_xvldx(_src, src_stride_3x);
    /* Row 4 at offsets 0 and 1 -> src4 / src9. */
    _src += (src_stride_4x - 1);
    DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src4, src9);

    DUP4_ARG2(__lasx_xvilvl_b, src5, src0, src6, src1, src7, src2, src8, src3,
              src0, src1, src2, src3);
    src4 = __lasx_xvilvl_b(src9, src4);
    /* Row n joined with row n + 1; outputs are assigned left to right. */
    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2,
              0x20, src4, src3, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvhaddw_hu_bu, src0, src0, src1, src1, src2, src2,
              src3, src3, src0, src1, src2, src3);
    DUP2_ARG2(__lasx_xvadd_h, src0, src1, src2, src3, sum0, sum1);
    sum0 = __lasx_xvaddi_hu(sum0, 1);
    sum1 = __lasx_xvaddi_hu(sum1, 1);
    sum0 = __lasx_xvsrani_b_h(sum1, sum0, 2);
    /* Same 0, 2, 1, 3 element order as the 8-row variant. */
    __lasx_xvstelm_d(sum0, dst, 0, 0);
    __lasx_xvstelm_d(sum0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(sum0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(sum0, dst + dst_stride_3x, 0, 3);
}

/**
 * Height dispatcher for the 8-wide "xy2" no-rounding put.
 *
 * h == 8 uses common_hv_bil_no_rnd_8x8_lasx(), h == 4 uses
 * common_hv_bil_no_rnd_4x8_lasx(); line_size serves as both strides.
 * Any other h writes nothing.
 *
 * @param block     destination pointer
 * @param pixels    source pointer
 * @param line_size stride used for both source and destination
 * @param h         number of rows requested; only 8 and 4 are handled
 */
void ff_put_no_rnd_pixels8_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hv_bil_no_rnd_8x8_lasx(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hv_bil_no_rnd_4x8_lasx(pixels, line_size, block, line_size);
    }
}

/**
 * Horizontal + vertical bilinear filter, 16 bytes wide, processed in groups
 * of eight rows.
 *
 * The loop runs height >> 3 times, so rows beyond the last multiple of 8
 * are not produced. Each iteration loads nine rows at _src and nine at
 * _src + 1, and leaves _src eight rows further down. The final narrowing
 * uses xvsrarni_b_h with a shift of 2 and no explicit bias is added --
 * presumably the rounding form, i.e. (sum + 2) >> 2.
 *
 * NOTE(review): 17 source bytes per row are needed while the loads
 * presumably fetch 32 -- confirm caller padding.
 *
 * @param src        top-left source byte
 * @param src_stride distance in bytes between source rows
 * @param dst        first destination byte
 * @param dst_stride distance in bytes between destination rows
 * @param height     number of rows; only complete groups of 8 are written
 */
static void common_hv_bil_16w_lasx(const uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   uint8_t height)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    __m256i src10, src11, src12, src13, src14, src15, src16, src17;
    __m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
    uint8_t loop_cnt;
    int32_t src_stride_2x = src_stride << 1;
    int32_t src_stride_4x = src_stride << 2;
    int32_t src_stride_3x = src_stride_2x + src_stride;
    uint8_t* _src = (uint8_t*)src;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        /* Rows 0..7 at offset 0 -> src0..src7. */
        src0 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2);
        src3 = __lasx_xvldx(_src, src_stride_3x);
        _src += src_stride_4x;
        src4 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6);
        src7 = __lasx_xvldx(_src, src_stride_3x);
        /* Rows 0..7 at offset 1 -> src9..src16. */
        _src += (1 - src_stride_4x);
        src9 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
                  src10, src11);
        src12 = __lasx_xvldx(_src, src_stride_3x);
        _src += src_stride_4x;
        src13 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
                  src14, src15);
        src16 = __lasx_xvldx(_src, src_stride_3x);
        /* Row 8 at offsets 0 and 1 -> src8 / src17; _src now sits on row 8. */
        _src += (src_stride_4x - 1);
        DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17);

        /*
         * Join row n with row n + 4 in one register. Statement order
         * matters: src4, src8 and src13 are consumed as inputs before (or
         * in the same left-to-right step as) they are overwritten.
         */
        DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2,
                  src6, 0x02, src3, src7, 0x02, src0, src1, src2, src3);
        DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10,
                  src14, 0x02, src11, src15, 0x02, src4, src5, src6, src7);
        DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02,
                  src8, src9);

        /* Interleave each row with its shifted copy, low and high halves. */
        DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8,
                  src3, sum0, sum2, sum4, sum6);
        DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8,
                  src3, sum1, sum3, sum5, sum7);
        src8 = __lasx_xvilvl_h(src9, src4);
        src9 = __lasx_xvilvh_h(src9, src4);

        /* Widening pairwise add (horizontal part). */
        DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2,
                  sum3, sum3, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6,
                  sum7, sum7, src4, src5, src6, src7);
        DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9);

        /* Add each row's sums to those of the row below (vertical part). */
        DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3,
                  src5, sum0, sum1, sum2, sum3);
        DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7,
                  src9, sum4, sum5, sum6, sum7);
        DUP4_ARG3(__lasx_xvsrarni_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5,
                  sum4, 2, sum7, sum6, 2, sum0, sum1, sum2, sum3);
        /* Elements 0/1 of sum0..sum3 are written as the first four rows ... */
        __lasx_xvstelm_d(sum0, dst, 0, 0);
        __lasx_xvstelm_d(sum0, dst, 8, 1);
        dst += dst_stride;
        __lasx_xvstelm_d(sum1, dst, 0, 0);
        __lasx_xvstelm_d(sum1, dst, 8, 1);
        dst += dst_stride;
        __lasx_xvstelm_d(sum2, dst, 0, 0);
        __lasx_xvstelm_d(sum2, dst, 8, 1);
        dst += dst_stride;
        __lasx_xvstelm_d(sum3, dst, 0, 0);
        __lasx_xvstelm_d(sum3, dst, 8, 1);
        dst += dst_stride;
        /* ... and elements 2/3 as the following four rows. */
        __lasx_xvstelm_d(sum0, dst, 0, 2);
        __lasx_xvstelm_d(sum0, dst, 8, 3);
        dst += dst_stride;
        __lasx_xvstelm_d(sum1, dst, 0, 2);
        __lasx_xvstelm_d(sum1, dst, 8, 3);
        dst += dst_stride;
        __lasx_xvstelm_d(sum2, dst, 0, 2);
        __lasx_xvstelm_d(sum2, dst, 8, 3);
        dst += dst_stride;
        __lasx_xvstelm_d(sum3, dst, 0, 2);
        __lasx_xvstelm_d(sum3, dst, 8, 3);
        dst += dst_stride;
    }
}

/**
 * 16-wide "xy2" put: forwards to common_hv_bil_16w_lasx() with line_size as
 * both strides.
 *
 * NOTE(review): h is an int here but the helper takes uint8_t height, so the
 * value is implicitly narrowed at the call.
 *
 * @param block     destination pointer
 * @param pixels    source pointer
 * @param line_size stride used for both source and destination
 * @param h         number of rows
 */
void ff_put_pixels16_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h)
{
    common_hv_bil_16w_lasx(pixels, line_size, block, line_size, h);
}

/**
 * Horizontal + vertical bilinear filter, 8 bytes wide, processed in groups
 * of four rows.
 *
 * Row 0 (at offsets 0 and 1) is loaded before the loop. Each of the
 * height >> 2 iterations loads four further rows, stores four output rows,
 * and carries the last loaded row over as the next iteration's row 0. Rows
 * beyond the last multiple of 4 are not produced. Narrowing uses
 * xvsrarni_b_h with a shift of 2 -- presumably (sum + 2) >> 2.
 *
 * NOTE(review): 9 source bytes per row are needed while the loads presumably
 * fetch 32 -- confirm caller padding.
 *
 * @param src        top-left source byte
 * @param src_stride distance in bytes between source rows
 * @param dst        first destination byte
 * @param dst_stride distance in bytes between destination rows
 * @param height     number of rows; only complete groups of 4 are written
 */
static void common_hv_bil_8w_lasx(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    __m256i src8, src9, sum0, sum1;
    uint8_t loop_cnt;
    int32_t src_stride_2x = src_stride << 1;
    int32_t src_stride_4x = src_stride << 2;
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_4x = dst_stride << 2;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src_stride_3x = src_stride_2x + src_stride;
    uint8_t* _src = (uint8_t*)src;

    /* Prime the pipeline with row 0 at offsets 0 (src0) and 1 (src5). */
    DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src0, src5);
    _src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* Rows 1..4 at offset 0 -> src1..src4, at offset 1 -> src6..src9. */
        src1 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src2, src3);
        src4 = __lasx_xvldx(_src, src_stride_3x);
        _src += 1;
        src6 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src7, src8);
        src9 = __lasx_xvldx(_src, src_stride_3x);
        _src += (src_stride_4x - 1);
        /*
         * src4 / src9 stay untouched here because they are reused raw at
         * the bottom of the loop; the interleaved row 4 goes to src5.
         */
        DUP4_ARG2(__lasx_xvilvl_b, src5, src0, src6, src1, src7, src2, src8, src3,
                  src0, src1, src2, src3);
        src5 = __lasx_xvilvl_b(src9, src4);
        DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2,
                  0x20, src5, src3, 0x20, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvhaddw_hu_bu, src0, src0, src1, src1, src2, src2,
                  src3, src3, src0, src1, src2, src3);
        DUP2_ARG2(__lasx_xvadd_h, src0, src1, src2, src3, sum0, sum1);
        sum0 = __lasx_xvsrarni_b_h(sum1, sum0, 2);
        /* Element order 0, 2, 1, 3 maps to consecutive output rows. */
        __lasx_xvstelm_d(sum0, dst, 0, 0);
        __lasx_xvstelm_d(sum0, dst + dst_stride, 0, 2);
        __lasx_xvstelm_d(sum0, dst + dst_stride_2x, 0, 1);
        __lasx_xvstelm_d(sum0, dst + dst_stride_3x, 0, 3);
        dst += dst_stride_4x;
        /* The last loaded row becomes row 0 of the next group. */
        src0 = src4;
        src5 = src9;
    }
}

/**
 * 8-wide "xy2" put: forwards to common_hv_bil_8w_lasx() with line_size as
 * both strides.
 *
 * NOTE(review): h is an int here but the helper takes uint8_t height, so the
 * value is implicitly narrowed at the call.
 *
 * @param block     destination pointer
 * @param pixels    source pointer
 * @param line_size stride used for both source and destination
 * @param h         number of rows
 */
void ff_put_pixels8_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    common_hv_bil_8w_lasx(pixels, line_size, block, line_size, h);
}