1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Loongson LASX optimized h264qpel 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * Copyright (c) 2020 Loongson Technology Corporation Limited 5cabdff1aSopenharmony_ci * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * This file is part of FFmpeg. 8cabdff1aSopenharmony_ci * 9cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 10cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 11cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 12cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 13cabdff1aSopenharmony_ci * 14cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 15cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 16cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17cabdff1aSopenharmony_ci * Lesser General Public License for more details. 18cabdff1aSopenharmony_ci * 19cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 20cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 21cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22cabdff1aSopenharmony_ci */ 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_ci#include "h264qpel_lasx.h" 25cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 26cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 27cabdff1aSopenharmony_ci 28cabdff1aSopenharmony_cistatic const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = { 29cabdff1aSopenharmony_ci /* 8 width cases */ 30cabdff1aSopenharmony_ci 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, 31cabdff1aSopenharmony_ci 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, 32cabdff1aSopenharmony_ci 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 33cabdff1aSopenharmony_ci 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 34cabdff1aSopenharmony_ci 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 35cabdff1aSopenharmony_ci 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 36cabdff1aSopenharmony_ci}; 37cabdff1aSopenharmony_ci 38cabdff1aSopenharmony_ci#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \ 39cabdff1aSopenharmony_ci( { \ 40cabdff1aSopenharmony_ci __m256i out0_m; \ 41cabdff1aSopenharmony_ci __m256i tmp0_m; \ 42cabdff1aSopenharmony_ci \ 43cabdff1aSopenharmony_ci tmp0_m = __lasx_xvshuf_b(in1, in0, mask0); \ 44cabdff1aSopenharmony_ci out0_m = __lasx_xvhaddw_h_b(tmp0_m, tmp0_m); \ 45cabdff1aSopenharmony_ci tmp0_m = __lasx_xvshuf_b(in1, in0, mask1); \ 46cabdff1aSopenharmony_ci out0_m = __lasx_xvdp2add_h_b(out0_m, minus5b, tmp0_m); \ 47cabdff1aSopenharmony_ci tmp0_m = __lasx_xvshuf_b(in1, in0, mask2); \ 48cabdff1aSopenharmony_ci out0_m = __lasx_xvdp2add_h_b(out0_m, plus20b, tmp0_m); \ 49cabdff1aSopenharmony_ci \ 50cabdff1aSopenharmony_ci out0_m; \ 51cabdff1aSopenharmony_ci} ) 52cabdff1aSopenharmony_ci 53cabdff1aSopenharmony_ci#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \ 54cabdff1aSopenharmony_ci( { \ 55cabdff1aSopenharmony_ci __m256i out0_m; \ 56cabdff1aSopenharmony_ci \ 57cabdff1aSopenharmony_ci out0_m = __lasx_xvdp2_h_b(in0, coeff0); \ 58cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvdp2add_h_b, out0_m, in1, coeff1, out0_m,\ 59cabdff1aSopenharmony_ci in2, coeff2, out0_m, out0_m); \ 60cabdff1aSopenharmony_ci \ 61cabdff1aSopenharmony_ci out0_m; \ 62cabdff1aSopenharmony_ci} ) 63cabdff1aSopenharmony_ci 64cabdff1aSopenharmony_cistatic av_always_inline 65cabdff1aSopenharmony_civoid avc_luma_hv_qrt_and_aver_dst_16x16_lasx(uint8_t *src_x, 66cabdff1aSopenharmony_ci uint8_t *src_y, 67cabdff1aSopenharmony_ci uint8_t *dst, ptrdiff_t stride) 68cabdff1aSopenharmony_ci{ 69cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 70cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 71cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 72cabdff1aSopenharmony_ci uint32_t loop_cnt; 73cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 74cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 75cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 76cabdff1aSopenharmony_ci __m256i tmp0, tmp1; 77cabdff1aSopenharmony_ci __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2; 78cabdff1aSopenharmony_ci __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 79cabdff1aSopenharmony_ci __m256i src_vt7, src_vt8; 80cabdff1aSopenharmony_ci __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h; 81cabdff1aSopenharmony_ci __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2; 82cabdff1aSopenharmony_ci __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 83cabdff1aSopenharmony_ci __m256i vt_out3, out0, out1, out2, out3; 84cabdff1aSopenharmony_ci __m256i minus5b = __lasx_xvldi(0xFB); 85cabdff1aSopenharmony_ci __m256i plus20b = __lasx_xvldi(20); 86cabdff1aSopenharmony_ci 87cabdff1aSopenharmony_ci filt0 = __lasx_xvreplgr2vr_h(filt_const0); 88cabdff1aSopenharmony_ci filt1 = __lasx_xvreplgr2vr_h(filt_const1); 89cabdff1aSopenharmony_ci filt2 = __lasx_xvreplgr2vr_h(filt_const2); 90cabdff1aSopenharmony_ci 91cabdff1aSopenharmony_ci mask0 = __lasx_xvld(luma_mask_arr, 0); 92cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2); 93cabdff1aSopenharmony_ci src_vt0 = __lasx_xvld(src_y, 0); 94cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x, 95cabdff1aSopenharmony_ci src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4); 96cabdff1aSopenharmony_ci src_y += stride_4x; 97cabdff1aSopenharmony_ci 98cabdff1aSopenharmony_ci src_vt0 = __lasx_xvxori_b(src_vt0, 128); 99cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128, 100cabdff1aSopenharmony_ci src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4); 101cabdff1aSopenharmony_ci 102cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 103cabdff1aSopenharmony_ci src_hz0 = __lasx_xvld(src_x, 0); 104cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x, 105cabdff1aSopenharmony_ci src_hz1, src_hz2); 106cabdff1aSopenharmony_ci src_hz3 = __lasx_xvldx(src_x, stride_3x); 107cabdff1aSopenharmony_ci src_x += stride_4x; 108cabdff1aSopenharmony_ci src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94); 109cabdff1aSopenharmony_ci src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94); 110cabdff1aSopenharmony_ci src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94); 111cabdff1aSopenharmony_ci src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94); 112cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128, 113cabdff1aSopenharmony_ci src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3); 114cabdff1aSopenharmony_ci 115cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 116cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 117cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 118cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 119cabdff1aSopenharmony_ci hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5); 120cabdff1aSopenharmony_ci hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5); 121cabdff1aSopenharmony_ci 122cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, 123cabdff1aSopenharmony_ci src_y, stride_3x, src_y, stride_4x, 124cabdff1aSopenharmony_ci src_vt5, src_vt6, src_vt7, src_vt8); 125cabdff1aSopenharmony_ci src_y += stride_4x; 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128, 128cabdff1aSopenharmony_ci src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8); 129cabdff1aSopenharmony_ci 130cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5, 131cabdff1aSopenharmony_ci 0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02, 132cabdff1aSopenharmony_ci src_vt0, src_vt1, src_vt2, src_vt3); 133cabdff1aSopenharmony_ci src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02); 134cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1, 135cabdff1aSopenharmony_ci src_vt3, src_vt2, src_vt87_h, src_vt3, 136cabdff1aSopenharmony_ci src_hz0, src_hz1, src_hz2, src_hz3); 137cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1, 138cabdff1aSopenharmony_ci src_vt3, src_vt2, src_vt87_h, src_vt3, 139cabdff1aSopenharmony_ci src_vt0, src_vt1, src_vt2, src_vt3); 140cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, src_hz1, 141cabdff1aSopenharmony_ci 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 0x02, 142cabdff1aSopenharmony_ci src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h); 143cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, src_hz1, 144cabdff1aSopenharmony_ci 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 0x13, 145cabdff1aSopenharmony_ci src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h); 146cabdff1aSopenharmony_ci vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, filt0, 147cabdff1aSopenharmony_ci filt1, filt2); 148cabdff1aSopenharmony_ci vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, filt0, 149cabdff1aSopenharmony_ci filt1, filt2); 150cabdff1aSopenharmony_ci vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, filt0, 151cabdff1aSopenharmony_ci filt1, filt2); 152cabdff1aSopenharmony_ci vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, filt0, 153cabdff1aSopenharmony_ci filt1, filt2); 154cabdff1aSopenharmony_ci vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5); 155cabdff1aSopenharmony_ci vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5); 156cabdff1aSopenharmony_ci 157cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2, 158cabdff1aSopenharmony_ci out0, out2); 159cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2, 160cabdff1aSopenharmony_ci out1, out3); 161cabdff1aSopenharmony_ci tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1); 162cabdff1aSopenharmony_ci tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1); 163cabdff1aSopenharmony_ci 164cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 165cabdff1aSopenharmony_ci out0 = __lasx_xvld(dst, 0); 166cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, dst, stride, dst, stride_2x, out1, out2); 167cabdff1aSopenharmony_ci out3 = __lasx_xvldx(dst, stride_3x); 168cabdff1aSopenharmony_ci out0 = __lasx_xvpermi_q(out0, out2, 0x02); 169cabdff1aSopenharmony_ci out1 = __lasx_xvpermi_q(out1, out3, 0x02); 170cabdff1aSopenharmony_ci out2 = __lasx_xvilvl_d(out1, out0); 171cabdff1aSopenharmony_ci out3 = __lasx_xvilvh_d(out1, out0); 172cabdff1aSopenharmony_ci out0 = __lasx_xvpermi_q(out2, out3, 0x02); 173cabdff1aSopenharmony_ci out1 = __lasx_xvpermi_q(out2, out3, 0x13); 174cabdff1aSopenharmony_ci tmp0 = __lasx_xvavgr_bu(out0, tmp0); 175cabdff1aSopenharmony_ci tmp1 = __lasx_xvavgr_bu(out1, tmp1); 176cabdff1aSopenharmony_ci 177cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst, 0, 0); 178cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst + stride, 0, 1); 179cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0); 180cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1); 181cabdff1aSopenharmony_ci 182cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst, 8, 2); 183cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst + stride, 8, 3); 184cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2); 185cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3); 186cabdff1aSopenharmony_ci 187cabdff1aSopenharmony_ci dst += stride_4x; 188cabdff1aSopenharmony_ci src_vt0 = src_vt4; 189cabdff1aSopenharmony_ci src_vt1 = src_vt5; 190cabdff1aSopenharmony_ci src_vt2 = src_vt6; 191cabdff1aSopenharmony_ci src_vt3 = src_vt7; 192cabdff1aSopenharmony_ci src_vt4 = src_vt8; 193cabdff1aSopenharmony_ci } 194cabdff1aSopenharmony_ci} 195cabdff1aSopenharmony_ci 196cabdff1aSopenharmony_cistatic av_always_inline void 197cabdff1aSopenharmony_ciavc_luma_hv_qrt_16x16_lasx(uint8_t *src_x, uint8_t *src_y, 198cabdff1aSopenharmony_ci uint8_t *dst, ptrdiff_t stride) 199cabdff1aSopenharmony_ci{ 200cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 201cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 202cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 203cabdff1aSopenharmony_ci uint32_t loop_cnt; 204cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 205cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 206cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 207cabdff1aSopenharmony_ci __m256i tmp0, tmp1; 208cabdff1aSopenharmony_ci __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2; 209cabdff1aSopenharmony_ci __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 210cabdff1aSopenharmony_ci __m256i src_vt7, src_vt8; 211cabdff1aSopenharmony_ci __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h; 212cabdff1aSopenharmony_ci __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2; 213cabdff1aSopenharmony_ci __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 214cabdff1aSopenharmony_ci __m256i vt_out3, out0, out1, out2, out3; 215cabdff1aSopenharmony_ci __m256i minus5b = __lasx_xvldi(0xFB); 216cabdff1aSopenharmony_ci __m256i plus20b = __lasx_xvldi(20); 217cabdff1aSopenharmony_ci 218cabdff1aSopenharmony_ci filt0 = __lasx_xvreplgr2vr_h(filt_const0); 219cabdff1aSopenharmony_ci filt1 = __lasx_xvreplgr2vr_h(filt_const1); 220cabdff1aSopenharmony_ci filt2 = __lasx_xvreplgr2vr_h(filt_const2); 221cabdff1aSopenharmony_ci 222cabdff1aSopenharmony_ci mask0 = __lasx_xvld(luma_mask_arr, 0); 223cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2); 224cabdff1aSopenharmony_ci src_vt0 = __lasx_xvld(src_y, 0); 225cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x, 226cabdff1aSopenharmony_ci src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4); 227cabdff1aSopenharmony_ci src_y += stride_4x; 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ci src_vt0 = __lasx_xvxori_b(src_vt0, 128); 230cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128, 231cabdff1aSopenharmony_ci src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4); 232cabdff1aSopenharmony_ci 233cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 234cabdff1aSopenharmony_ci src_hz0 = __lasx_xvld(src_x, 0); 235cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x, 236cabdff1aSopenharmony_ci src_hz1, src_hz2); 237cabdff1aSopenharmony_ci src_hz3 = __lasx_xvldx(src_x, stride_3x); 238cabdff1aSopenharmony_ci src_x += stride_4x; 239cabdff1aSopenharmony_ci src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94); 240cabdff1aSopenharmony_ci src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94); 241cabdff1aSopenharmony_ci src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94); 242cabdff1aSopenharmony_ci src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94); 243cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128, 244cabdff1aSopenharmony_ci src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3); 245cabdff1aSopenharmony_ci 246cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 247cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 248cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 249cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 250cabdff1aSopenharmony_ci hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5); 251cabdff1aSopenharmony_ci hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5); 252cabdff1aSopenharmony_ci 253cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, 254cabdff1aSopenharmony_ci src_y, stride_3x, src_y, stride_4x, 255cabdff1aSopenharmony_ci src_vt5, src_vt6, src_vt7, src_vt8); 256cabdff1aSopenharmony_ci src_y += stride_4x; 257cabdff1aSopenharmony_ci 258cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128, 259cabdff1aSopenharmony_ci src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8); 260cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5, 261cabdff1aSopenharmony_ci 0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02, 262cabdff1aSopenharmony_ci src_vt0, src_vt1, src_vt2, src_vt3); 263cabdff1aSopenharmony_ci src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02); 264cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1, 265cabdff1aSopenharmony_ci src_vt3, src_vt2, src_vt87_h, src_vt3, 266cabdff1aSopenharmony_ci src_hz0, src_hz1, src_hz2, src_hz3); 267cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1, 268cabdff1aSopenharmony_ci src_vt3, src_vt2, src_vt87_h, src_vt3, 269cabdff1aSopenharmony_ci src_vt0, src_vt1, src_vt2, src_vt3); 270cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, 271cabdff1aSopenharmony_ci src_hz1, 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 272cabdff1aSopenharmony_ci 0x02, src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h); 273cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, 274cabdff1aSopenharmony_ci src_hz1, 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 275cabdff1aSopenharmony_ci 0x13, src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h); 276cabdff1aSopenharmony_ci 277cabdff1aSopenharmony_ci vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, 278cabdff1aSopenharmony_ci filt0, filt1, filt2); 279cabdff1aSopenharmony_ci vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, 280cabdff1aSopenharmony_ci filt0, filt1, filt2); 281cabdff1aSopenharmony_ci vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, 282cabdff1aSopenharmony_ci filt0, filt1, filt2); 283cabdff1aSopenharmony_ci vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, 284cabdff1aSopenharmony_ci filt0, filt1, filt2); 285cabdff1aSopenharmony_ci vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5); 286cabdff1aSopenharmony_ci vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5); 287cabdff1aSopenharmony_ci 288cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2, 289cabdff1aSopenharmony_ci out0, out2); 290cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2, 291cabdff1aSopenharmony_ci out1, out3); 292cabdff1aSopenharmony_ci tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1); 293cabdff1aSopenharmony_ci tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1); 294cabdff1aSopenharmony_ci 295cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 296cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst, 0, 0); 297cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst + stride, 0, 1); 298cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0); 299cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1); 300cabdff1aSopenharmony_ci 301cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst, 8, 2); 302cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst + stride, 8, 3); 303cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2); 304cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3); 305cabdff1aSopenharmony_ci 306cabdff1aSopenharmony_ci dst += stride_4x; 307cabdff1aSopenharmony_ci src_vt0 = src_vt4; 308cabdff1aSopenharmony_ci src_vt1 = src_vt5; 309cabdff1aSopenharmony_ci src_vt2 = src_vt6; 310cabdff1aSopenharmony_ci src_vt3 = src_vt7; 311cabdff1aSopenharmony_ci src_vt4 = src_vt8; 312cabdff1aSopenharmony_ci } 313cabdff1aSopenharmony_ci} 314cabdff1aSopenharmony_ci 315cabdff1aSopenharmony_ci/* put_pixels8_8_inline_asm: dst = src */ 316cabdff1aSopenharmony_cistatic av_always_inline void 317cabdff1aSopenharmony_ciput_pixels8_8_inline_asm(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) 318cabdff1aSopenharmony_ci{ 319cabdff1aSopenharmony_ci uint64_t tmp[8]; 320cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3, stride_4; 321cabdff1aSopenharmony_ci __asm__ volatile ( 322cabdff1aSopenharmony_ci "slli.d %[stride_2], %[stride], 1 \n\t" 323cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[stride] \n\t" 324cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 325cabdff1aSopenharmony_ci "ld.d %[tmp0], %[src], 0x0 \n\t" 326cabdff1aSopenharmony_ci "ldx.d %[tmp1], %[src], %[stride] \n\t" 327cabdff1aSopenharmony_ci "ldx.d %[tmp2], %[src], %[stride_2] \n\t" 328cabdff1aSopenharmony_ci "ldx.d %[tmp3], %[src], %[stride_3] \n\t" 329cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 330cabdff1aSopenharmony_ci "ld.d %[tmp4], %[src], 0x0 \n\t" 331cabdff1aSopenharmony_ci "ldx.d %[tmp5], %[src], %[stride] \n\t" 332cabdff1aSopenharmony_ci "ldx.d %[tmp6], %[src], %[stride_2] \n\t" 333cabdff1aSopenharmony_ci "ldx.d %[tmp7], %[src], %[stride_3] \n\t" 334cabdff1aSopenharmony_ci 335cabdff1aSopenharmony_ci "st.d %[tmp0], %[dst], 0x0 \n\t" 336cabdff1aSopenharmony_ci "stx.d %[tmp1], %[dst], %[stride] \n\t" 337cabdff1aSopenharmony_ci "stx.d %[tmp2], %[dst], %[stride_2] \n\t" 338cabdff1aSopenharmony_ci "stx.d %[tmp3], %[dst], %[stride_3] \n\t" 339cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride_4] \n\t" 340cabdff1aSopenharmony_ci "st.d %[tmp4], %[dst], 0x0 \n\t" 341cabdff1aSopenharmony_ci "stx.d %[tmp5], %[dst], %[stride] \n\t" 342cabdff1aSopenharmony_ci "stx.d %[tmp6], %[dst], %[stride_2] \n\t" 343cabdff1aSopenharmony_ci "stx.d %[tmp7], %[dst], %[stride_3] \n\t" 344cabdff1aSopenharmony_ci : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), 345cabdff1aSopenharmony_ci [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]), 346cabdff1aSopenharmony_ci [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]), 347cabdff1aSopenharmony_ci [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]), 348cabdff1aSopenharmony_ci [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 349cabdff1aSopenharmony_ci [stride_4]"=&r"(stride_4), 350cabdff1aSopenharmony_ci [dst]"+&r"(dst), [src]"+&r"(src) 351cabdff1aSopenharmony_ci : [stride]"r"(stride) 352cabdff1aSopenharmony_ci : "memory" 353cabdff1aSopenharmony_ci ); 354cabdff1aSopenharmony_ci} 355cabdff1aSopenharmony_ci 356cabdff1aSopenharmony_ci/* avg_pixels8_8_lsx : dst = avg(src, dst) 357cabdff1aSopenharmony_ci * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8. 358cabdff1aSopenharmony_ci * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 359cabdff1aSopenharmony_cistatic av_always_inline void 360cabdff1aSopenharmony_ciavg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) 361cabdff1aSopenharmony_ci{ 362cabdff1aSopenharmony_ci uint8_t *tmp = dst; 363cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3, stride_4; 364cabdff1aSopenharmony_ci __asm__ volatile ( 365cabdff1aSopenharmony_ci /* h0~h7 */ 366cabdff1aSopenharmony_ci "slli.d %[stride_2], %[stride], 1 \n\t" 367cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[stride] \n\t" 368cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 369cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 370cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[stride] \n\t" 371cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 372cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 373cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 374cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 375cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[stride] \n\t" 376cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 377cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 378cabdff1aSopenharmony_ci 379cabdff1aSopenharmony_ci "vld $vr8, %[tmp], 0 \n\t" 380cabdff1aSopenharmony_ci "vldx $vr9, %[tmp], %[stride] \n\t" 381cabdff1aSopenharmony_ci "vldx $vr10, %[tmp], %[stride_2] \n\t" 382cabdff1aSopenharmony_ci "vldx $vr11, %[tmp], %[stride_3] \n\t" 383cabdff1aSopenharmony_ci "add.d %[tmp], %[tmp], %[stride_4] \n\t" 384cabdff1aSopenharmony_ci "vld $vr12, %[tmp], 0 \n\t" 385cabdff1aSopenharmony_ci "vldx $vr13, %[tmp], %[stride] \n\t" 386cabdff1aSopenharmony_ci "vldx $vr14, %[tmp], %[stride_2] \n\t" 387cabdff1aSopenharmony_ci "vldx $vr15, %[tmp], %[stride_3] \n\t" 388cabdff1aSopenharmony_ci 389cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 390cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 391cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 392cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 393cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 394cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 395cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 396cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 397cabdff1aSopenharmony_ci 398cabdff1aSopenharmony_ci "vstelm.d $vr0, %[dst], 0, 0 \n\t" 399cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride] \n\t" 400cabdff1aSopenharmony_ci "vstelm.d $vr1, %[dst], 0, 0 \n\t" 401cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride] \n\t" 402cabdff1aSopenharmony_ci "vstelm.d $vr2, %[dst], 0, 0 \n\t" 403cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride] \n\t" 404cabdff1aSopenharmony_ci "vstelm.d $vr3, %[dst], 0, 0 \n\t" 405cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride] \n\t" 406cabdff1aSopenharmony_ci "vstelm.d $vr4, %[dst], 0, 0 \n\t" 407cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride] \n\t" 408cabdff1aSopenharmony_ci "vstelm.d $vr5, %[dst], 0, 0 \n\t" 409cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride] \n\t" 410cabdff1aSopenharmony_ci "vstelm.d $vr6, %[dst], 0, 0 \n\t" 411cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride] \n\t" 412cabdff1aSopenharmony_ci "vstelm.d $vr7, %[dst], 0, 0 \n\t" 413cabdff1aSopenharmony_ci : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src), 414cabdff1aSopenharmony_ci [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 415cabdff1aSopenharmony_ci [stride_4]"=&r"(stride_4) 416cabdff1aSopenharmony_ci : [stride]"r"(stride) 417cabdff1aSopenharmony_ci : "memory" 418cabdff1aSopenharmony_ci ); 419cabdff1aSopenharmony_ci} 420cabdff1aSopenharmony_ci 421cabdff1aSopenharmony_ci/* avg_pixels8_8_lsx : dst = avg(src, dst) 422cabdff1aSopenharmony_ci * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8. 423cabdff1aSopenharmony_ci * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 424cabdff1aSopenharmony_cistatic av_always_inline void 425cabdff1aSopenharmony_ciput_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half, 426cabdff1aSopenharmony_ci ptrdiff_t dstStride, ptrdiff_t srcStride) 427cabdff1aSopenharmony_ci{ 428cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3, stride_4; 429cabdff1aSopenharmony_ci __asm__ volatile ( 430cabdff1aSopenharmony_ci /* h0~h7 */ 431cabdff1aSopenharmony_ci "slli.d %[stride_2], %[srcStride], 1 \n\t" 432cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" 433cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 434cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 435cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[srcStride] \n\t" 436cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 437cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 438cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 439cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 440cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[srcStride] \n\t" 441cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 442cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 443cabdff1aSopenharmony_ci 444cabdff1aSopenharmony_ci "vld $vr8, %[half], 0x00 \n\t" 445cabdff1aSopenharmony_ci "vld $vr9, %[half], 0x08 \n\t" 446cabdff1aSopenharmony_ci "vld $vr10, %[half], 0x10 \n\t" 447cabdff1aSopenharmony_ci "vld $vr11, %[half], 0x18 \n\t" 448cabdff1aSopenharmony_ci "vld $vr12, %[half], 0x20 \n\t" 449cabdff1aSopenharmony_ci "vld $vr13, %[half], 0x28 \n\t" 450cabdff1aSopenharmony_ci "vld $vr14, %[half], 0x30 \n\t" 451cabdff1aSopenharmony_ci "vld $vr15, %[half], 0x38 \n\t" 452cabdff1aSopenharmony_ci 453cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 454cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 455cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 456cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 457cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 458cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 459cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 460cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 461cabdff1aSopenharmony_ci 462cabdff1aSopenharmony_ci "vstelm.d $vr0, %[dst], 0, 0 \n\t" 463cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 464cabdff1aSopenharmony_ci "vstelm.d $vr1, %[dst], 0, 0 \n\t" 465cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 466cabdff1aSopenharmony_ci "vstelm.d $vr2, %[dst], 0, 0 \n\t" 467cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 468cabdff1aSopenharmony_ci "vstelm.d $vr3, %[dst], 0, 0 \n\t" 469cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 470cabdff1aSopenharmony_ci "vstelm.d $vr4, %[dst], 0, 0 \n\t" 471cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 472cabdff1aSopenharmony_ci "vstelm.d $vr5, %[dst], 0, 0 \n\t" 473cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 474cabdff1aSopenharmony_ci "vstelm.d $vr6, %[dst], 0, 0 \n\t" 475cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 476cabdff1aSopenharmony_ci "vstelm.d $vr7, %[dst], 0, 0 \n\t" 477cabdff1aSopenharmony_ci : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src), 478cabdff1aSopenharmony_ci [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 479cabdff1aSopenharmony_ci [stride_4]"=&r"(stride_4) 480cabdff1aSopenharmony_ci : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride) 481cabdff1aSopenharmony_ci : "memory" 482cabdff1aSopenharmony_ci ); 483cabdff1aSopenharmony_ci} 484cabdff1aSopenharmony_ci 485cabdff1aSopenharmony_ci/* avg_pixels8_8_lsx : dst = avg(src, dst) 486cabdff1aSopenharmony_ci * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8. 487cabdff1aSopenharmony_ci * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 488cabdff1aSopenharmony_cistatic av_always_inline void 489cabdff1aSopenharmony_ciavg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half, 490cabdff1aSopenharmony_ci ptrdiff_t dstStride, ptrdiff_t srcStride) 491cabdff1aSopenharmony_ci{ 492cabdff1aSopenharmony_ci uint8_t *tmp = dst; 493cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3, stride_4; 494cabdff1aSopenharmony_ci __asm__ volatile ( 495cabdff1aSopenharmony_ci /* h0~h7 */ 496cabdff1aSopenharmony_ci "slli.d %[stride_2], %[srcStride], 1 \n\t" 497cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" 498cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 499cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 500cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[srcStride] \n\t" 501cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 502cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 503cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 504cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 505cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[srcStride] \n\t" 506cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 507cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 508cabdff1aSopenharmony_ci 509cabdff1aSopenharmony_ci "vld $vr8, %[half], 0x00 \n\t" 510cabdff1aSopenharmony_ci "vld $vr9, %[half], 0x08 \n\t" 511cabdff1aSopenharmony_ci "vld $vr10, %[half], 0x10 \n\t" 512cabdff1aSopenharmony_ci "vld $vr11, %[half], 0x18 \n\t" 513cabdff1aSopenharmony_ci "vld $vr12, %[half], 0x20 \n\t" 514cabdff1aSopenharmony_ci "vld $vr13, %[half], 0x28 \n\t" 515cabdff1aSopenharmony_ci "vld $vr14, %[half], 0x30 \n\t" 516cabdff1aSopenharmony_ci "vld $vr15, %[half], 0x38 \n\t" 517cabdff1aSopenharmony_ci 518cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 519cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 520cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 521cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 522cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 523cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 524cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 525cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 526cabdff1aSopenharmony_ci 527cabdff1aSopenharmony_ci "slli.d %[stride_2], %[dstStride], 1 \n\t" 528cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[dstStride] \n\t" 529cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 530cabdff1aSopenharmony_ci "vld $vr8, %[tmp], 0 \n\t" 531cabdff1aSopenharmony_ci "vldx $vr9, %[tmp], %[dstStride] \n\t" 532cabdff1aSopenharmony_ci "vldx $vr10, %[tmp], %[stride_2] \n\t" 533cabdff1aSopenharmony_ci "vldx $vr11, %[tmp], %[stride_3] \n\t" 534cabdff1aSopenharmony_ci "add.d %[tmp], %[tmp], %[stride_4] \n\t" 535cabdff1aSopenharmony_ci "vld $vr12, %[tmp], 0 \n\t" 536cabdff1aSopenharmony_ci "vldx $vr13, %[tmp], %[dstStride] \n\t" 537cabdff1aSopenharmony_ci "vldx $vr14, %[tmp], %[stride_2] \n\t" 538cabdff1aSopenharmony_ci "vldx $vr15, %[tmp], %[stride_3] \n\t" 539cabdff1aSopenharmony_ci 540cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 541cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 542cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 543cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 544cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 545cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 546cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 547cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 548cabdff1aSopenharmony_ci 549cabdff1aSopenharmony_ci "vstelm.d $vr0, %[dst], 0, 0 \n\t" 550cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 551cabdff1aSopenharmony_ci "vstelm.d $vr1, %[dst], 0, 0 \n\t" 552cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 553cabdff1aSopenharmony_ci "vstelm.d $vr2, %[dst], 0, 0 \n\t" 554cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 555cabdff1aSopenharmony_ci "vstelm.d $vr3, %[dst], 0, 0 \n\t" 556cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 557cabdff1aSopenharmony_ci "vstelm.d $vr4, %[dst], 0, 0 \n\t" 558cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 559cabdff1aSopenharmony_ci "vstelm.d $vr5, %[dst], 0, 0 \n\t" 560cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 561cabdff1aSopenharmony_ci "vstelm.d $vr6, %[dst], 0, 0 \n\t" 562cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstStride] \n\t" 563cabdff1aSopenharmony_ci "vstelm.d $vr7, %[dst], 0, 0 \n\t" 564cabdff1aSopenharmony_ci : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), 565cabdff1aSopenharmony_ci [src]"+&r"(src), [stride_2]"=&r"(stride_2), 566cabdff1aSopenharmony_ci [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4) 567cabdff1aSopenharmony_ci : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride) 568cabdff1aSopenharmony_ci : "memory" 569cabdff1aSopenharmony_ci ); 570cabdff1aSopenharmony_ci} 571cabdff1aSopenharmony_ci 572cabdff1aSopenharmony_ci/* put_pixels16_8_lsx: dst = src */ 573cabdff1aSopenharmony_cistatic av_always_inline void 574cabdff1aSopenharmony_ciput_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) 575cabdff1aSopenharmony_ci{ 576cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3, stride_4; 577cabdff1aSopenharmony_ci __asm__ volatile ( 578cabdff1aSopenharmony_ci "slli.d %[stride_2], %[stride], 1 \n\t" 579cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[stride] \n\t" 580cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 581cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 582cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[stride] \n\t" 583cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 584cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 585cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 586cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 587cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[stride] \n\t" 588cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 589cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 590cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 591cabdff1aSopenharmony_ci 592cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0 \n\t" 593cabdff1aSopenharmony_ci "vstx $vr1, %[dst], %[stride] \n\t" 594cabdff1aSopenharmony_ci "vstx $vr2, %[dst], %[stride_2] \n\t" 595cabdff1aSopenharmony_ci "vstx $vr3, %[dst], %[stride_3] \n\t" 596cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride_4] \n\t" 597cabdff1aSopenharmony_ci "vst $vr4, %[dst], 0 \n\t" 598cabdff1aSopenharmony_ci "vstx $vr5, %[dst], %[stride] \n\t" 599cabdff1aSopenharmony_ci "vstx $vr6, %[dst], %[stride_2] \n\t" 600cabdff1aSopenharmony_ci "vstx $vr7, %[dst], %[stride_3] \n\t" 601cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride_4] \n\t" 602cabdff1aSopenharmony_ci 603cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 604cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[stride] \n\t" 605cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 606cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 607cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 608cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 609cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[stride] \n\t" 610cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 611cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 612cabdff1aSopenharmony_ci 613cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0 \n\t" 614cabdff1aSopenharmony_ci "vstx $vr1, %[dst], %[stride] \n\t" 615cabdff1aSopenharmony_ci "vstx $vr2, %[dst], %[stride_2] \n\t" 616cabdff1aSopenharmony_ci "vstx $vr3, %[dst], %[stride_3] \n\t" 617cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride_4] \n\t" 618cabdff1aSopenharmony_ci "vst $vr4, %[dst], 0 \n\t" 619cabdff1aSopenharmony_ci "vstx $vr5, %[dst], %[stride] \n\t" 620cabdff1aSopenharmony_ci "vstx $vr6, %[dst], %[stride_2] \n\t" 621cabdff1aSopenharmony_ci "vstx $vr7, %[dst], %[stride_3] \n\t" 622cabdff1aSopenharmony_ci : [dst]"+&r"(dst), [src]"+&r"(src), 623cabdff1aSopenharmony_ci [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 624cabdff1aSopenharmony_ci [stride_4]"=&r"(stride_4) 625cabdff1aSopenharmony_ci : [stride]"r"(stride) 626cabdff1aSopenharmony_ci : "memory" 627cabdff1aSopenharmony_ci ); 628cabdff1aSopenharmony_ci} 629cabdff1aSopenharmony_ci 630cabdff1aSopenharmony_ci/* avg_pixels16_8_lsx : dst = avg(src, dst) 631cabdff1aSopenharmony_ci * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8. 632cabdff1aSopenharmony_ci * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 633cabdff1aSopenharmony_cistatic av_always_inline void 634cabdff1aSopenharmony_ciavg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) 635cabdff1aSopenharmony_ci{ 636cabdff1aSopenharmony_ci uint8_t *tmp = dst; 637cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3, stride_4; 638cabdff1aSopenharmony_ci __asm__ volatile ( 639cabdff1aSopenharmony_ci /* h0~h7 */ 640cabdff1aSopenharmony_ci "slli.d %[stride_2], %[stride], 1 \n\t" 641cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[stride] \n\t" 642cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 643cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 644cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[stride] \n\t" 645cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 646cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 647cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 648cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 649cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[stride] \n\t" 650cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 651cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 652cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 653cabdff1aSopenharmony_ci 654cabdff1aSopenharmony_ci "vld $vr8, %[tmp], 0 \n\t" 655cabdff1aSopenharmony_ci "vldx $vr9, %[tmp], %[stride] \n\t" 656cabdff1aSopenharmony_ci "vldx $vr10, %[tmp], %[stride_2] \n\t" 657cabdff1aSopenharmony_ci "vldx $vr11, %[tmp], %[stride_3] \n\t" 658cabdff1aSopenharmony_ci "add.d %[tmp], %[tmp], %[stride_4] \n\t" 659cabdff1aSopenharmony_ci "vld $vr12, %[tmp], 0 \n\t" 660cabdff1aSopenharmony_ci "vldx $vr13, %[tmp], %[stride] \n\t" 661cabdff1aSopenharmony_ci "vldx $vr14, %[tmp], %[stride_2] \n\t" 662cabdff1aSopenharmony_ci "vldx $vr15, %[tmp], %[stride_3] \n\t" 663cabdff1aSopenharmony_ci "add.d %[tmp], %[tmp], %[stride_4] \n\t" 664cabdff1aSopenharmony_ci 665cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 666cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 667cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 668cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 669cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 670cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 671cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 672cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 673cabdff1aSopenharmony_ci 674cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0 \n\t" 675cabdff1aSopenharmony_ci "vstx $vr1, %[dst], %[stride] \n\t" 676cabdff1aSopenharmony_ci "vstx $vr2, %[dst], %[stride_2] \n\t" 677cabdff1aSopenharmony_ci "vstx $vr3, %[dst], %[stride_3] \n\t" 678cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride_4] \n\t" 679cabdff1aSopenharmony_ci "vst $vr4, %[dst], 0 \n\t" 680cabdff1aSopenharmony_ci "vstx $vr5, %[dst], %[stride] \n\t" 681cabdff1aSopenharmony_ci "vstx $vr6, %[dst], %[stride_2] \n\t" 682cabdff1aSopenharmony_ci "vstx $vr7, %[dst], %[stride_3] \n\t" 683cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride_4] \n\t" 684cabdff1aSopenharmony_ci 685cabdff1aSopenharmony_ci /* h8~h15 */ 686cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 687cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[stride] \n\t" 688cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 689cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 690cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 691cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 692cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[stride] \n\t" 693cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 694cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 695cabdff1aSopenharmony_ci 696cabdff1aSopenharmony_ci "vld $vr8, %[tmp], 0 \n\t" 697cabdff1aSopenharmony_ci "vldx $vr9, %[tmp], %[stride] \n\t" 698cabdff1aSopenharmony_ci "vldx $vr10, %[tmp], %[stride_2] \n\t" 699cabdff1aSopenharmony_ci "vldx $vr11, %[tmp], %[stride_3] \n\t" 700cabdff1aSopenharmony_ci "add.d %[tmp], %[tmp], %[stride_4] \n\t" 701cabdff1aSopenharmony_ci "vld $vr12, %[tmp], 0 \n\t" 702cabdff1aSopenharmony_ci "vldx $vr13, %[tmp], %[stride] \n\t" 703cabdff1aSopenharmony_ci "vldx $vr14, %[tmp], %[stride_2] \n\t" 704cabdff1aSopenharmony_ci "vldx $vr15, %[tmp], %[stride_3] \n\t" 705cabdff1aSopenharmony_ci 706cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 707cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 708cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 709cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 710cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 711cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 712cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 713cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 714cabdff1aSopenharmony_ci 715cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0 \n\t" 716cabdff1aSopenharmony_ci "vstx $vr1, %[dst], %[stride] \n\t" 717cabdff1aSopenharmony_ci "vstx $vr2, %[dst], %[stride_2] \n\t" 718cabdff1aSopenharmony_ci "vstx $vr3, %[dst], %[stride_3] \n\t" 719cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[stride_4] \n\t" 720cabdff1aSopenharmony_ci "vst $vr4, %[dst], 0 \n\t" 721cabdff1aSopenharmony_ci "vstx $vr5, %[dst], %[stride] \n\t" 722cabdff1aSopenharmony_ci "vstx $vr6, %[dst], %[stride_2] \n\t" 723cabdff1aSopenharmony_ci "vstx $vr7, %[dst], %[stride_3] \n\t" 724cabdff1aSopenharmony_ci : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src), 725cabdff1aSopenharmony_ci [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 726cabdff1aSopenharmony_ci [stride_4]"=&r"(stride_4) 727cabdff1aSopenharmony_ci : [stride]"r"(stride) 728cabdff1aSopenharmony_ci : "memory" 729cabdff1aSopenharmony_ci ); 730cabdff1aSopenharmony_ci} 731cabdff1aSopenharmony_ci 732cabdff1aSopenharmony_ci/* avg_pixels16_8_lsx : dst = avg(src, dst) 733cabdff1aSopenharmony_ci * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8. 734cabdff1aSopenharmony_ci * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 735cabdff1aSopenharmony_cistatic av_always_inline void 736cabdff1aSopenharmony_ciput_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half, 737cabdff1aSopenharmony_ci ptrdiff_t dstStride, ptrdiff_t srcStride) 738cabdff1aSopenharmony_ci{ 739cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3, stride_4; 740cabdff1aSopenharmony_ci ptrdiff_t dstride_2, dstride_3, dstride_4; 741cabdff1aSopenharmony_ci __asm__ volatile ( 742cabdff1aSopenharmony_ci "slli.d %[stride_2], %[srcStride], 1 \n\t" 743cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" 744cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 745cabdff1aSopenharmony_ci "slli.d %[dstride_2], %[dstStride], 1 \n\t" 746cabdff1aSopenharmony_ci "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t" 747cabdff1aSopenharmony_ci "slli.d %[dstride_4], %[dstride_2], 1 \n\t" 748cabdff1aSopenharmony_ci /* h0~h7 */ 749cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 750cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[srcStride] \n\t" 751cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 752cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 753cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 754cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 755cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[srcStride] \n\t" 756cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 757cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 758cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 759cabdff1aSopenharmony_ci 760cabdff1aSopenharmony_ci "vld $vr8, %[half], 0x00 \n\t" 761cabdff1aSopenharmony_ci "vld $vr9, %[half], 0x10 \n\t" 762cabdff1aSopenharmony_ci "vld $vr10, %[half], 0x20 \n\t" 763cabdff1aSopenharmony_ci "vld $vr11, %[half], 0x30 \n\t" 764cabdff1aSopenharmony_ci "vld $vr12, %[half], 0x40 \n\t" 765cabdff1aSopenharmony_ci "vld $vr13, %[half], 0x50 \n\t" 766cabdff1aSopenharmony_ci "vld $vr14, %[half], 0x60 \n\t" 767cabdff1aSopenharmony_ci "vld $vr15, %[half], 0x70 \n\t" 768cabdff1aSopenharmony_ci 769cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 770cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 771cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 772cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 773cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 774cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 775cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 776cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 777cabdff1aSopenharmony_ci 778cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0 \n\t" 779cabdff1aSopenharmony_ci "vstx $vr1, %[dst], %[dstStride] \n\t" 780cabdff1aSopenharmony_ci "vstx $vr2, %[dst], %[dstride_2] \n\t" 781cabdff1aSopenharmony_ci "vstx $vr3, %[dst], %[dstride_3] \n\t" 782cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstride_4] \n\t" 783cabdff1aSopenharmony_ci "vst $vr4, %[dst], 0 \n\t" 784cabdff1aSopenharmony_ci "vstx $vr5, %[dst], %[dstStride] \n\t" 785cabdff1aSopenharmony_ci "vstx $vr6, %[dst], %[dstride_2] \n\t" 786cabdff1aSopenharmony_ci "vstx $vr7, %[dst], %[dstride_3] \n\t" 787cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstride_4] \n\t" 788cabdff1aSopenharmony_ci 789cabdff1aSopenharmony_ci /* h8~h15 */ 790cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 791cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[srcStride] \n\t" 792cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 793cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 794cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 795cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 796cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[srcStride] \n\t" 797cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 798cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 799cabdff1aSopenharmony_ci 800cabdff1aSopenharmony_ci "vld $vr8, %[half], 0x80 \n\t" 801cabdff1aSopenharmony_ci "vld $vr9, %[half], 0x90 \n\t" 802cabdff1aSopenharmony_ci "vld $vr10, %[half], 0xa0 \n\t" 803cabdff1aSopenharmony_ci "vld $vr11, %[half], 0xb0 \n\t" 804cabdff1aSopenharmony_ci "vld $vr12, %[half], 0xc0 \n\t" 805cabdff1aSopenharmony_ci "vld $vr13, %[half], 0xd0 \n\t" 806cabdff1aSopenharmony_ci "vld $vr14, %[half], 0xe0 \n\t" 807cabdff1aSopenharmony_ci "vld $vr15, %[half], 0xf0 \n\t" 808cabdff1aSopenharmony_ci 809cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 810cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 811cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 812cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 813cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 814cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 815cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 816cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 817cabdff1aSopenharmony_ci 818cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0 \n\t" 819cabdff1aSopenharmony_ci "vstx $vr1, %[dst], %[dstStride] \n\t" 820cabdff1aSopenharmony_ci "vstx $vr2, %[dst], %[dstride_2] \n\t" 821cabdff1aSopenharmony_ci "vstx $vr3, %[dst], %[dstride_3] \n\t" 822cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstride_4] \n\t" 823cabdff1aSopenharmony_ci "vst $vr4, %[dst], 0 \n\t" 824cabdff1aSopenharmony_ci "vstx $vr5, %[dst], %[dstStride] \n\t" 825cabdff1aSopenharmony_ci "vstx $vr6, %[dst], %[dstride_2] \n\t" 826cabdff1aSopenharmony_ci "vstx $vr7, %[dst], %[dstride_3] \n\t" 827cabdff1aSopenharmony_ci : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src), 828cabdff1aSopenharmony_ci [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 829cabdff1aSopenharmony_ci [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2), 830cabdff1aSopenharmony_ci [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4) 831cabdff1aSopenharmony_ci : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride) 832cabdff1aSopenharmony_ci : "memory" 833cabdff1aSopenharmony_ci ); 834cabdff1aSopenharmony_ci} 835cabdff1aSopenharmony_ci 836cabdff1aSopenharmony_ci/* avg_pixels16_8_lsx : dst = avg(src, dst) 837cabdff1aSopenharmony_ci * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8. 838cabdff1aSopenharmony_ci * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ 839cabdff1aSopenharmony_cistatic av_always_inline void 840cabdff1aSopenharmony_ciavg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half, 841cabdff1aSopenharmony_ci ptrdiff_t dstStride, ptrdiff_t srcStride) 842cabdff1aSopenharmony_ci{ 843cabdff1aSopenharmony_ci uint8_t *tmp = dst; 844cabdff1aSopenharmony_ci ptrdiff_t stride_2, stride_3, stride_4; 845cabdff1aSopenharmony_ci ptrdiff_t dstride_2, dstride_3, dstride_4; 846cabdff1aSopenharmony_ci __asm__ volatile ( 847cabdff1aSopenharmony_ci "slli.d %[stride_2], %[srcStride], 1 \n\t" 848cabdff1aSopenharmony_ci "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" 849cabdff1aSopenharmony_ci "slli.d %[stride_4], %[stride_2], 1 \n\t" 850cabdff1aSopenharmony_ci "slli.d %[dstride_2], %[dstStride], 1 \n\t" 851cabdff1aSopenharmony_ci "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t" 852cabdff1aSopenharmony_ci "slli.d %[dstride_4], %[dstride_2], 1 \n\t" 853cabdff1aSopenharmony_ci /* h0~h7 */ 854cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 855cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[srcStride] \n\t" 856cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 857cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 858cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 859cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 860cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[srcStride] \n\t" 861cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 862cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 863cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 864cabdff1aSopenharmony_ci 865cabdff1aSopenharmony_ci "vld $vr8, %[half], 0x00 \n\t" 866cabdff1aSopenharmony_ci "vld $vr9, %[half], 0x10 \n\t" 867cabdff1aSopenharmony_ci "vld $vr10, %[half], 0x20 \n\t" 868cabdff1aSopenharmony_ci "vld $vr11, %[half], 0x30 \n\t" 869cabdff1aSopenharmony_ci "vld $vr12, %[half], 0x40 \n\t" 870cabdff1aSopenharmony_ci "vld $vr13, %[half], 0x50 \n\t" 871cabdff1aSopenharmony_ci "vld $vr14, %[half], 0x60 \n\t" 872cabdff1aSopenharmony_ci "vld $vr15, %[half], 0x70 \n\t" 873cabdff1aSopenharmony_ci 874cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 875cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 876cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 877cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 878cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 879cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 880cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 881cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 882cabdff1aSopenharmony_ci 883cabdff1aSopenharmony_ci "vld $vr8, %[tmp], 0 \n\t" 884cabdff1aSopenharmony_ci "vldx $vr9, %[tmp], %[dstStride] \n\t" 885cabdff1aSopenharmony_ci "vldx $vr10, %[tmp], %[dstride_2] \n\t" 886cabdff1aSopenharmony_ci "vldx $vr11, %[tmp], %[dstride_3] \n\t" 887cabdff1aSopenharmony_ci "add.d %[tmp], %[tmp], %[dstride_4] \n\t" 888cabdff1aSopenharmony_ci "vld $vr12, %[tmp], 0 \n\t" 889cabdff1aSopenharmony_ci "vldx $vr13, %[tmp], %[dstStride] \n\t" 890cabdff1aSopenharmony_ci "vldx $vr14, %[tmp], %[dstride_2] \n\t" 891cabdff1aSopenharmony_ci "vldx $vr15, %[tmp], %[dstride_3] \n\t" 892cabdff1aSopenharmony_ci "add.d %[tmp], %[tmp], %[dstride_4] \n\t" 893cabdff1aSopenharmony_ci 894cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 895cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 896cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 897cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 898cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 899cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 900cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 901cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 902cabdff1aSopenharmony_ci 903cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0 \n\t" 904cabdff1aSopenharmony_ci "vstx $vr1, %[dst], %[dstStride] \n\t" 905cabdff1aSopenharmony_ci "vstx $vr2, %[dst], %[dstride_2] \n\t" 906cabdff1aSopenharmony_ci "vstx $vr3, %[dst], %[dstride_3] \n\t" 907cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstride_4] \n\t" 908cabdff1aSopenharmony_ci "vst $vr4, %[dst], 0 \n\t" 909cabdff1aSopenharmony_ci "vstx $vr5, %[dst], %[dstStride] \n\t" 910cabdff1aSopenharmony_ci "vstx $vr6, %[dst], %[dstride_2] \n\t" 911cabdff1aSopenharmony_ci "vstx $vr7, %[dst], %[dstride_3] \n\t" 912cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstride_4] \n\t" 913cabdff1aSopenharmony_ci 914cabdff1aSopenharmony_ci /* h8~h15 */ 915cabdff1aSopenharmony_ci "vld $vr0, %[src], 0 \n\t" 916cabdff1aSopenharmony_ci "vldx $vr1, %[src], %[srcStride] \n\t" 917cabdff1aSopenharmony_ci "vldx $vr2, %[src], %[stride_2] \n\t" 918cabdff1aSopenharmony_ci "vldx $vr3, %[src], %[stride_3] \n\t" 919cabdff1aSopenharmony_ci "add.d %[src], %[src], %[stride_4] \n\t" 920cabdff1aSopenharmony_ci "vld $vr4, %[src], 0 \n\t" 921cabdff1aSopenharmony_ci "vldx $vr5, %[src], %[srcStride] \n\t" 922cabdff1aSopenharmony_ci "vldx $vr6, %[src], %[stride_2] \n\t" 923cabdff1aSopenharmony_ci "vldx $vr7, %[src], %[stride_3] \n\t" 924cabdff1aSopenharmony_ci 925cabdff1aSopenharmony_ci "vld $vr8, %[half], 0x80 \n\t" 926cabdff1aSopenharmony_ci "vld $vr9, %[half], 0x90 \n\t" 927cabdff1aSopenharmony_ci "vld $vr10, %[half], 0xa0 \n\t" 928cabdff1aSopenharmony_ci "vld $vr11, %[half], 0xb0 \n\t" 929cabdff1aSopenharmony_ci "vld $vr12, %[half], 0xc0 \n\t" 930cabdff1aSopenharmony_ci "vld $vr13, %[half], 0xd0 \n\t" 931cabdff1aSopenharmony_ci "vld $vr14, %[half], 0xe0 \n\t" 932cabdff1aSopenharmony_ci "vld $vr15, %[half], 0xf0 \n\t" 933cabdff1aSopenharmony_ci 934cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 935cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 936cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 937cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 938cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 939cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 940cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 941cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 942cabdff1aSopenharmony_ci 943cabdff1aSopenharmony_ci "vld $vr8, %[tmp], 0 \n\t" 944cabdff1aSopenharmony_ci "vldx $vr9, %[tmp], %[dstStride] \n\t" 945cabdff1aSopenharmony_ci "vldx $vr10, %[tmp], %[dstride_2] \n\t" 946cabdff1aSopenharmony_ci "vldx $vr11, %[tmp], %[dstride_3] \n\t" 947cabdff1aSopenharmony_ci "add.d %[tmp], %[tmp], %[dstride_4] \n\t" 948cabdff1aSopenharmony_ci "vld $vr12, %[tmp], 0 \n\t" 949cabdff1aSopenharmony_ci "vldx $vr13, %[tmp], %[dstStride] \n\t" 950cabdff1aSopenharmony_ci "vldx $vr14, %[tmp], %[dstride_2] \n\t" 951cabdff1aSopenharmony_ci "vldx $vr15, %[tmp], %[dstride_3] \n\t" 952cabdff1aSopenharmony_ci 953cabdff1aSopenharmony_ci "vavgr.bu $vr0, $vr8, $vr0 \n\t" 954cabdff1aSopenharmony_ci "vavgr.bu $vr1, $vr9, $vr1 \n\t" 955cabdff1aSopenharmony_ci "vavgr.bu $vr2, $vr10, $vr2 \n\t" 956cabdff1aSopenharmony_ci "vavgr.bu $vr3, $vr11, $vr3 \n\t" 957cabdff1aSopenharmony_ci "vavgr.bu $vr4, $vr12, $vr4 \n\t" 958cabdff1aSopenharmony_ci "vavgr.bu $vr5, $vr13, $vr5 \n\t" 959cabdff1aSopenharmony_ci "vavgr.bu $vr6, $vr14, $vr6 \n\t" 960cabdff1aSopenharmony_ci "vavgr.bu $vr7, $vr15, $vr7 \n\t" 961cabdff1aSopenharmony_ci 962cabdff1aSopenharmony_ci "vst $vr0, %[dst], 0 \n\t" 963cabdff1aSopenharmony_ci "vstx $vr1, %[dst], %[dstStride] \n\t" 964cabdff1aSopenharmony_ci "vstx $vr2, %[dst], %[dstride_2] \n\t" 965cabdff1aSopenharmony_ci "vstx $vr3, %[dst], %[dstride_3] \n\t" 966cabdff1aSopenharmony_ci "add.d %[dst], %[dst], %[dstride_4] \n\t" 967cabdff1aSopenharmony_ci "vst $vr4, %[dst], 0 \n\t" 968cabdff1aSopenharmony_ci "vstx $vr5, %[dst], %[dstStride] \n\t" 969cabdff1aSopenharmony_ci "vstx $vr6, %[dst], %[dstride_2] \n\t" 970cabdff1aSopenharmony_ci "vstx $vr7, %[dst], %[dstride_3] \n\t" 971cabdff1aSopenharmony_ci : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src), 972cabdff1aSopenharmony_ci [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), 973cabdff1aSopenharmony_ci [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2), 974cabdff1aSopenharmony_ci [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4) 975cabdff1aSopenharmony_ci : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride) 976cabdff1aSopenharmony_ci : "memory" 977cabdff1aSopenharmony_ci ); 978cabdff1aSopenharmony_ci} 979cabdff1aSopenharmony_ci 980cabdff1aSopenharmony_ci#define QPEL8_H_LOWPASS(out_v) \ 981cabdff1aSopenharmony_ci src00 = __lasx_xvld(src, - 2); \ 982cabdff1aSopenharmony_ci src += srcStride; \ 983cabdff1aSopenharmony_ci src10 = __lasx_xvld(src, - 2); \ 984cabdff1aSopenharmony_ci src += srcStride; \ 985cabdff1aSopenharmony_ci src00 = __lasx_xvpermi_q(src00, src10, 0x02); \ 986cabdff1aSopenharmony_ci src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \ 987cabdff1aSopenharmony_ci src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \ 988cabdff1aSopenharmony_ci src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \ 989cabdff1aSopenharmony_ci src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \ 990cabdff1aSopenharmony_ci src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \ 991cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\ 992cabdff1aSopenharmony_ci src00 = __lasx_xvaddwl_h_bu(src00, src05); \ 993cabdff1aSopenharmony_ci src02 = __lasx_xvmul_h(src02, h_20); \ 994cabdff1aSopenharmony_ci src01 = __lasx_xvmul_h(src01, h_5); \ 995cabdff1aSopenharmony_ci src02 = __lasx_xvssub_h(src02, src01); \ 996cabdff1aSopenharmony_ci src02 = __lasx_xvsadd_h(src02, src00); \ 997cabdff1aSopenharmony_ci src02 = __lasx_xvsadd_h(src02, h_16); \ 998cabdff1aSopenharmony_ci out_v = __lasx_xvssrani_bu_h(src02, src02, 5); \ 999cabdff1aSopenharmony_ci 1000cabdff1aSopenharmony_cistatic av_always_inline void 1001cabdff1aSopenharmony_ciput_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride, 1002cabdff1aSopenharmony_ci int srcStride) 1003cabdff1aSopenharmony_ci{ 1004cabdff1aSopenharmony_ci int dstStride_2x = dstStride << 1; 1005cabdff1aSopenharmony_ci __m256i src00, src01, src02, src03, src04, src05, src10; 1006cabdff1aSopenharmony_ci __m256i out0, out1, out2, out3; 1007cabdff1aSopenharmony_ci __m256i h_20 = __lasx_xvldi(0x414); 1008cabdff1aSopenharmony_ci __m256i h_5 = __lasx_xvldi(0x405); 1009cabdff1aSopenharmony_ci __m256i h_16 = __lasx_xvldi(0x410); 1010cabdff1aSopenharmony_ci __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0}; 1011cabdff1aSopenharmony_ci __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0}; 1012cabdff1aSopenharmony_ci __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0}; 1013cabdff1aSopenharmony_ci __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0}; 1014cabdff1aSopenharmony_ci __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0}; 1015cabdff1aSopenharmony_ci 1016cabdff1aSopenharmony_ci QPEL8_H_LOWPASS(out0) 1017cabdff1aSopenharmony_ci QPEL8_H_LOWPASS(out1) 1018cabdff1aSopenharmony_ci QPEL8_H_LOWPASS(out2) 1019cabdff1aSopenharmony_ci QPEL8_H_LOWPASS(out3) 1020cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst, 0, 0); 1021cabdff1aSopenharmony_ci __lasx_xvstelm_d(out0, dst + dstStride, 0, 2); 1022cabdff1aSopenharmony_ci dst += dstStride_2x; 1023cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst, 0, 0); 1024cabdff1aSopenharmony_ci __lasx_xvstelm_d(out1, dst + dstStride, 0, 2); 1025cabdff1aSopenharmony_ci dst += dstStride_2x; 1026cabdff1aSopenharmony_ci __lasx_xvstelm_d(out2, dst, 0, 0); 1027cabdff1aSopenharmony_ci __lasx_xvstelm_d(out2, dst + dstStride, 0, 2); 1028cabdff1aSopenharmony_ci dst += dstStride_2x; 1029cabdff1aSopenharmony_ci __lasx_xvstelm_d(out3, dst, 0, 0); 1030cabdff1aSopenharmony_ci __lasx_xvstelm_d(out3, dst + dstStride, 0, 2); 1031cabdff1aSopenharmony_ci} 1032cabdff1aSopenharmony_ci 1033cabdff1aSopenharmony_ci#define QPEL8_V_LOWPASS(src0, src1, src2, src3, src4, src5, src6, \ 1034cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3, tmp4, tmp5) \ 1035cabdff1aSopenharmony_ci{ \ 1036cabdff1aSopenharmony_ci tmp0 = __lasx_xvpermi_q(src0, src1, 0x02); \ 1037cabdff1aSopenharmony_ci tmp1 = __lasx_xvpermi_q(src1, src2, 0x02); \ 1038cabdff1aSopenharmony_ci tmp2 = __lasx_xvpermi_q(src2, src3, 0x02); \ 1039cabdff1aSopenharmony_ci tmp3 = __lasx_xvpermi_q(src3, src4, 0x02); \ 1040cabdff1aSopenharmony_ci tmp4 = __lasx_xvpermi_q(src4, src5, 0x02); \ 1041cabdff1aSopenharmony_ci tmp5 = __lasx_xvpermi_q(src5, src6, 0x02); \ 1042cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvaddwl_h_bu, tmp2, tmp3, tmp1, tmp4, tmp2, tmp1); \ 1043cabdff1aSopenharmony_ci tmp0 = __lasx_xvaddwl_h_bu(tmp0, tmp5); \ 1044cabdff1aSopenharmony_ci tmp2 = __lasx_xvmul_h(tmp2, h_20); \ 1045cabdff1aSopenharmony_ci tmp1 = __lasx_xvmul_h(tmp1, h_5); \ 1046cabdff1aSopenharmony_ci tmp2 = __lasx_xvssub_h(tmp2, tmp1); \ 1047cabdff1aSopenharmony_ci tmp2 = __lasx_xvsadd_h(tmp2, tmp0); \ 1048cabdff1aSopenharmony_ci tmp2 = __lasx_xvsadd_h(tmp2, h_16); \ 1049cabdff1aSopenharmony_ci tmp2 = __lasx_xvssrani_bu_h(tmp2, tmp2, 5); \ 1050cabdff1aSopenharmony_ci} 1051cabdff1aSopenharmony_ci 1052cabdff1aSopenharmony_cistatic av_always_inline void 1053cabdff1aSopenharmony_ciput_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride, 1054cabdff1aSopenharmony_ci int srcStride) 1055cabdff1aSopenharmony_ci{ 1056cabdff1aSopenharmony_ci int srcStride_2x = srcStride << 1; 1057cabdff1aSopenharmony_ci int dstStride_2x = dstStride << 1; 1058cabdff1aSopenharmony_ci int srcStride_4x = srcStride << 2; 1059cabdff1aSopenharmony_ci int srcStride_3x = srcStride_2x + srcStride; 1060cabdff1aSopenharmony_ci __m256i src00, src01, src02, src03, src04, src05, src06; 1061cabdff1aSopenharmony_ci __m256i src07, src08, src09, src10, src11, src12; 1062cabdff1aSopenharmony_ci __m256i tmp00, tmp01, tmp02, tmp03, tmp04, tmp05; 1063cabdff1aSopenharmony_ci __m256i h_20 = __lasx_xvldi(0x414); 1064cabdff1aSopenharmony_ci __m256i h_5 = __lasx_xvldi(0x405); 1065cabdff1aSopenharmony_ci __m256i h_16 = __lasx_xvldi(0x410); 1066cabdff1aSopenharmony_ci 1067cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0, 1068cabdff1aSopenharmony_ci src00, src01); 1069cabdff1aSopenharmony_ci src02 = __lasx_xvld(src, 0); 1070cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src, 1071cabdff1aSopenharmony_ci srcStride_3x, src, srcStride_4x, src03, src04, src05, src06); 1072cabdff1aSopenharmony_ci src += srcStride_4x; 1073cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src, 1074cabdff1aSopenharmony_ci srcStride_3x, src, srcStride_4x, src07, src08, src09, src10); 1075cabdff1aSopenharmony_ci src += srcStride_4x; 1076cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12); 1077cabdff1aSopenharmony_ci 1078cabdff1aSopenharmony_ci QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06, 1079cabdff1aSopenharmony_ci tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1080cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp02, dst, 0, 0); 1081cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2); 1082cabdff1aSopenharmony_ci dst += dstStride_2x; 1083cabdff1aSopenharmony_ci QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08, 1084cabdff1aSopenharmony_ci tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1085cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp02, dst, 0, 0); 1086cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2); 1087cabdff1aSopenharmony_ci dst += dstStride_2x; 1088cabdff1aSopenharmony_ci QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10, 1089cabdff1aSopenharmony_ci tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1090cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp02, dst, 0, 0); 1091cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2); 1092cabdff1aSopenharmony_ci dst += dstStride_2x; 1093cabdff1aSopenharmony_ci QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12, 1094cabdff1aSopenharmony_ci tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1095cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp02, dst, 0, 0); 1096cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2); 1097cabdff1aSopenharmony_ci} 1098cabdff1aSopenharmony_ci 1099cabdff1aSopenharmony_cistatic av_always_inline void 1100cabdff1aSopenharmony_ciavg_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride, 1101cabdff1aSopenharmony_ci int srcStride) 1102cabdff1aSopenharmony_ci{ 1103cabdff1aSopenharmony_ci int srcStride_2x = srcStride << 1; 1104cabdff1aSopenharmony_ci int srcStride_4x = srcStride << 2; 1105cabdff1aSopenharmony_ci int dstStride_2x = dstStride << 1; 1106cabdff1aSopenharmony_ci int dstStride_4x = dstStride << 2; 1107cabdff1aSopenharmony_ci int srcStride_3x = srcStride_2x + srcStride; 1108cabdff1aSopenharmony_ci int dstStride_3x = dstStride_2x + dstStride; 1109cabdff1aSopenharmony_ci __m256i src00, src01, src02, src03, src04, src05, src06; 1110cabdff1aSopenharmony_ci __m256i src07, src08, src09, src10, src11, src12, tmp00; 1111cabdff1aSopenharmony_ci __m256i tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09; 1112cabdff1aSopenharmony_ci __m256i h_20 = __lasx_xvldi(0x414); 1113cabdff1aSopenharmony_ci __m256i h_5 = __lasx_xvldi(0x405); 1114cabdff1aSopenharmony_ci __m256i h_16 = __lasx_xvldi(0x410); 1115cabdff1aSopenharmony_ci 1116cabdff1aSopenharmony_ci 1117cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0, 1118cabdff1aSopenharmony_ci src00, src01); 1119cabdff1aSopenharmony_ci src02 = __lasx_xvld(src, 0); 1120cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src, 1121cabdff1aSopenharmony_ci srcStride_3x, src, srcStride_4x, src03, src04, src05, src06); 1122cabdff1aSopenharmony_ci src += srcStride_4x; 1123cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src, 1124cabdff1aSopenharmony_ci srcStride_3x, src, srcStride_4x, src07, src08, src09, src10); 1125cabdff1aSopenharmony_ci src += srcStride_4x; 1126cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12); 1127cabdff1aSopenharmony_ci 1128cabdff1aSopenharmony_ci tmp06 = __lasx_xvld(dst, 0); 1129cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, 1130cabdff1aSopenharmony_ci dst, dstStride_3x, dst, dstStride_4x, 1131cabdff1aSopenharmony_ci tmp07, tmp02, tmp03, tmp04); 1132cabdff1aSopenharmony_ci dst += dstStride_4x; 1133cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, 1134cabdff1aSopenharmony_ci tmp05, tmp00); 1135cabdff1aSopenharmony_ci tmp01 = __lasx_xvldx(dst, dstStride_3x); 1136cabdff1aSopenharmony_ci dst -= dstStride_4x; 1137cabdff1aSopenharmony_ci 1138cabdff1aSopenharmony_ci tmp06 = __lasx_xvpermi_q(tmp06, tmp07, 0x02); 1139cabdff1aSopenharmony_ci tmp07 = __lasx_xvpermi_q(tmp02, tmp03, 0x02); 1140cabdff1aSopenharmony_ci tmp08 = __lasx_xvpermi_q(tmp04, tmp05, 0x02); 1141cabdff1aSopenharmony_ci tmp09 = __lasx_xvpermi_q(tmp00, tmp01, 0x02); 1142cabdff1aSopenharmony_ci 1143cabdff1aSopenharmony_ci QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06, 1144cabdff1aSopenharmony_ci tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1145cabdff1aSopenharmony_ci tmp06 = __lasx_xvavgr_bu(tmp06, tmp02); 1146cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp06, dst, 0, 0); 1147cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp06, dst + dstStride, 0, 2); 1148cabdff1aSopenharmony_ci dst += dstStride_2x; 1149cabdff1aSopenharmony_ci QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08, 1150cabdff1aSopenharmony_ci tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1151cabdff1aSopenharmony_ci tmp07 = __lasx_xvavgr_bu(tmp07, tmp02); 1152cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp07, dst, 0, 0); 1153cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp07, dst + dstStride, 0, 2); 1154cabdff1aSopenharmony_ci dst += dstStride_2x; 1155cabdff1aSopenharmony_ci QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10, 1156cabdff1aSopenharmony_ci tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1157cabdff1aSopenharmony_ci tmp08 = __lasx_xvavgr_bu(tmp08, tmp02); 1158cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp08, dst, 0, 0); 1159cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp08, dst + dstStride, 0, 2); 1160cabdff1aSopenharmony_ci dst += dstStride_2x; 1161cabdff1aSopenharmony_ci QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12, 1162cabdff1aSopenharmony_ci tmp00, tmp01, tmp02, tmp03, tmp04, tmp05); 1163cabdff1aSopenharmony_ci tmp09 = __lasx_xvavgr_bu(tmp09, tmp02); 1164cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp09, dst, 0, 0); 1165cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp09, dst + dstStride, 0, 2); 1166cabdff1aSopenharmony_ci} 1167cabdff1aSopenharmony_ci 1168cabdff1aSopenharmony_ci#define QPEL8_HV_LOWPASS_H(tmp) \ 1169cabdff1aSopenharmony_ci{ \ 1170cabdff1aSopenharmony_ci src00 = __lasx_xvld(src, -2); \ 1171cabdff1aSopenharmony_ci src += srcStride; \ 1172cabdff1aSopenharmony_ci src10 = __lasx_xvld(src, -2); \ 1173cabdff1aSopenharmony_ci src += srcStride; \ 1174cabdff1aSopenharmony_ci src00 = __lasx_xvpermi_q(src00, src10, 0x02); \ 1175cabdff1aSopenharmony_ci src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \ 1176cabdff1aSopenharmony_ci src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \ 1177cabdff1aSopenharmony_ci src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \ 1178cabdff1aSopenharmony_ci src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \ 1179cabdff1aSopenharmony_ci src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \ 1180cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\ 1181cabdff1aSopenharmony_ci src00 = __lasx_xvaddwl_h_bu(src00, src05); \ 1182cabdff1aSopenharmony_ci src02 = __lasx_xvmul_h(src02, h_20); \ 1183cabdff1aSopenharmony_ci src01 = __lasx_xvmul_h(src01, h_5); \ 1184cabdff1aSopenharmony_ci src02 = __lasx_xvssub_h(src02, src01); \ 1185cabdff1aSopenharmony_ci tmp = __lasx_xvsadd_h(src02, src00); \ 1186cabdff1aSopenharmony_ci} 1187cabdff1aSopenharmony_ci 1188cabdff1aSopenharmony_ci#define QPEL8_HV_LOWPASS_V(src0, src1, src2, src3, \ 1189cabdff1aSopenharmony_ci src4, src5, temp0, temp1, \ 1190cabdff1aSopenharmony_ci temp2, temp3, temp4, temp5, \ 1191cabdff1aSopenharmony_ci out) \ 1192cabdff1aSopenharmony_ci{ \ 1193cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvaddwl_w_h, src2, src3, src1, src4, temp0, temp2); \ 1194cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvaddwh_w_h, src2, src3, src1, src4, temp1, temp3); \ 1195cabdff1aSopenharmony_ci temp4 = __lasx_xvaddwl_w_h(src0, src5); \ 1196cabdff1aSopenharmony_ci temp5 = __lasx_xvaddwh_w_h(src0, src5); \ 1197cabdff1aSopenharmony_ci temp0 = __lasx_xvmul_w(temp0, w_20); \ 1198cabdff1aSopenharmony_ci temp1 = __lasx_xvmul_w(temp1, w_20); \ 1199cabdff1aSopenharmony_ci temp2 = __lasx_xvmul_w(temp2, w_5); \ 1200cabdff1aSopenharmony_ci temp3 = __lasx_xvmul_w(temp3, w_5); \ 1201cabdff1aSopenharmony_ci temp0 = __lasx_xvssub_w(temp0, temp2); \ 1202cabdff1aSopenharmony_ci temp1 = __lasx_xvssub_w(temp1, temp3); \ 1203cabdff1aSopenharmony_ci temp0 = __lasx_xvsadd_w(temp0, temp4); \ 1204cabdff1aSopenharmony_ci temp1 = __lasx_xvsadd_w(temp1, temp5); \ 1205cabdff1aSopenharmony_ci temp0 = __lasx_xvsadd_w(temp0, w_512); \ 1206cabdff1aSopenharmony_ci temp1 = __lasx_xvsadd_w(temp1, w_512); \ 1207cabdff1aSopenharmony_ci temp0 = __lasx_xvssrani_hu_w(temp0, temp0, 10); \ 1208cabdff1aSopenharmony_ci temp1 = __lasx_xvssrani_hu_w(temp1, temp1, 10); \ 1209cabdff1aSopenharmony_ci temp0 = __lasx_xvpackev_d(temp1, temp0); \ 1210cabdff1aSopenharmony_ci out = __lasx_xvssrani_bu_h(temp0, temp0, 0); \ 1211cabdff1aSopenharmony_ci} 1212cabdff1aSopenharmony_ci 1213cabdff1aSopenharmony_cistatic av_always_inline void 1214cabdff1aSopenharmony_ciput_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1215cabdff1aSopenharmony_ci ptrdiff_t dstStride, ptrdiff_t srcStride) 1216cabdff1aSopenharmony_ci{ 1217cabdff1aSopenharmony_ci __m256i src00, src01, src02, src03, src04, src05, src10; 1218cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; 1219cabdff1aSopenharmony_ci __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12; 1220cabdff1aSopenharmony_ci __m256i h_20 = __lasx_xvldi(0x414); 1221cabdff1aSopenharmony_ci __m256i h_5 = __lasx_xvldi(0x405); 1222cabdff1aSopenharmony_ci __m256i w_20 = __lasx_xvldi(0x814); 1223cabdff1aSopenharmony_ci __m256i w_5 = __lasx_xvldi(0x805); 1224cabdff1aSopenharmony_ci __m256i w_512 = {512}; 1225cabdff1aSopenharmony_ci __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0}; 1226cabdff1aSopenharmony_ci __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0}; 1227cabdff1aSopenharmony_ci __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0}; 1228cabdff1aSopenharmony_ci __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0}; 1229cabdff1aSopenharmony_ci __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0}; 1230cabdff1aSopenharmony_ci 1231cabdff1aSopenharmony_ci w_512 = __lasx_xvreplve0_w(w_512); 1232cabdff1aSopenharmony_ci 1233cabdff1aSopenharmony_ci src -= srcStride << 1; 1234cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp0) 1235cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp2) 1236cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp4) 1237cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp6) 1238cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp8) 1239cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp10) 1240cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp12) 1241cabdff1aSopenharmony_ci tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21); 1242cabdff1aSopenharmony_ci tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21); 1243cabdff1aSopenharmony_ci tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21); 1244cabdff1aSopenharmony_ci tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21); 1245cabdff1aSopenharmony_ci tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21); 1246cabdff1aSopenharmony_ci tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21); 1247cabdff1aSopenharmony_ci 1248cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01, 1249cabdff1aSopenharmony_ci src02, src03, src04, src05, tmp0) 1250cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01, 1251cabdff1aSopenharmony_ci src02, src03, src04, src05, tmp2) 1252cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01, 1253cabdff1aSopenharmony_ci src02, src03, src04, src05, tmp4) 1254cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01, 1255cabdff1aSopenharmony_ci src02, src03, src04, src05, tmp6) 1256cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst, 0, 0); 1257cabdff1aSopenharmony_ci dst += dstStride; 1258cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst, 0, 2); 1259cabdff1aSopenharmony_ci dst += dstStride; 1260cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp2, dst, 0, 0); 1261cabdff1aSopenharmony_ci dst += dstStride; 1262cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp2, dst, 0, 2); 1263cabdff1aSopenharmony_ci dst += dstStride; 1264cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp4, dst, 0, 0); 1265cabdff1aSopenharmony_ci dst += dstStride; 1266cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp4, dst, 0, 2); 1267cabdff1aSopenharmony_ci dst += dstStride; 1268cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp6, dst, 0, 0); 1269cabdff1aSopenharmony_ci dst += dstStride; 1270cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp6, dst, 0, 2); 1271cabdff1aSopenharmony_ci} 1272cabdff1aSopenharmony_ci 1273cabdff1aSopenharmony_cistatic av_always_inline void 1274cabdff1aSopenharmony_ciavg_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride, 1275cabdff1aSopenharmony_ci int srcStride) 1276cabdff1aSopenharmony_ci{ 1277cabdff1aSopenharmony_ci int dstStride_2x = dstStride << 1; 1278cabdff1aSopenharmony_ci int dstStride_4x = dstStride << 2; 1279cabdff1aSopenharmony_ci int dstStride_3x = dstStride_2x + dstStride; 1280cabdff1aSopenharmony_ci __m256i src00, src01, src02, src03, src04, src05, src10; 1281cabdff1aSopenharmony_ci __m256i dst00, dst01, dst0, dst1, dst2, dst3; 1282cabdff1aSopenharmony_ci __m256i out0, out1, out2, out3; 1283cabdff1aSopenharmony_ci __m256i h_20 = __lasx_xvldi(0x414); 1284cabdff1aSopenharmony_ci __m256i h_5 = __lasx_xvldi(0x405); 1285cabdff1aSopenharmony_ci __m256i h_16 = __lasx_xvldi(0x410); 1286cabdff1aSopenharmony_ci __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0}; 1287cabdff1aSopenharmony_ci __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0}; 1288cabdff1aSopenharmony_ci __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0}; 1289cabdff1aSopenharmony_ci __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0}; 1290cabdff1aSopenharmony_ci __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0}; 1291cabdff1aSopenharmony_ci 1292cabdff1aSopenharmony_ci QPEL8_H_LOWPASS(out0) 1293cabdff1aSopenharmony_ci QPEL8_H_LOWPASS(out1) 1294cabdff1aSopenharmony_ci QPEL8_H_LOWPASS(out2) 1295cabdff1aSopenharmony_ci QPEL8_H_LOWPASS(out3) 1296cabdff1aSopenharmony_ci src00 = __lasx_xvld(dst, 0); 1297cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst, 1298cabdff1aSopenharmony_ci dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04); 1299cabdff1aSopenharmony_ci dst += dstStride_4x; 1300cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, dst00); 1301cabdff1aSopenharmony_ci dst01 = __lasx_xvldx(dst, dstStride_3x); 1302cabdff1aSopenharmony_ci dst -= dstStride_4x; 1303cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(src00, src01, 0x02); 1304cabdff1aSopenharmony_ci dst1 = __lasx_xvpermi_q(src02, src03, 0x02); 1305cabdff1aSopenharmony_ci dst2 = __lasx_xvpermi_q(src04, src05, 0x02); 1306cabdff1aSopenharmony_ci dst3 = __lasx_xvpermi_q(dst00, dst01, 0x02); 1307cabdff1aSopenharmony_ci dst0 = __lasx_xvavgr_bu(dst0, out0); 1308cabdff1aSopenharmony_ci dst1 = __lasx_xvavgr_bu(dst1, out1); 1309cabdff1aSopenharmony_ci dst2 = __lasx_xvavgr_bu(dst2, out2); 1310cabdff1aSopenharmony_ci dst3 = __lasx_xvavgr_bu(dst3, out3); 1311cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 0); 1312cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + dstStride, 0, 2); 1313cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + dstStride_2x, 0, 0); 1314cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + dstStride_3x, 0, 2); 1315cabdff1aSopenharmony_ci dst += dstStride_4x; 1316cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst, 0, 0); 1317cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst + dstStride, 0, 2); 1318cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst + dstStride_2x, 0, 0); 1319cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst + dstStride_3x, 0, 2); 1320cabdff1aSopenharmony_ci} 1321cabdff1aSopenharmony_ci 1322cabdff1aSopenharmony_cistatic av_always_inline void 1323cabdff1aSopenharmony_ciavg_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1324cabdff1aSopenharmony_ci ptrdiff_t dstStride, ptrdiff_t srcStride) 1325cabdff1aSopenharmony_ci{ 1326cabdff1aSopenharmony_ci __m256i src00, src01, src02, src03, src04, src05, src10; 1327cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; 1328cabdff1aSopenharmony_ci __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12; 1329cabdff1aSopenharmony_ci __m256i h_20 = __lasx_xvldi(0x414); 1330cabdff1aSopenharmony_ci __m256i h_5 = __lasx_xvldi(0x405); 1331cabdff1aSopenharmony_ci __m256i w_20 = __lasx_xvldi(0x814); 1332cabdff1aSopenharmony_ci __m256i w_5 = __lasx_xvldi(0x805); 1333cabdff1aSopenharmony_ci __m256i w_512 = {512}; 1334cabdff1aSopenharmony_ci __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0}; 1335cabdff1aSopenharmony_ci __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0}; 1336cabdff1aSopenharmony_ci __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0}; 1337cabdff1aSopenharmony_ci __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0}; 1338cabdff1aSopenharmony_ci __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0}; 1339cabdff1aSopenharmony_ci ptrdiff_t dstStride_2x = dstStride << 1; 1340cabdff1aSopenharmony_ci ptrdiff_t dstStride_4x = dstStride << 2; 1341cabdff1aSopenharmony_ci ptrdiff_t dstStride_3x = dstStride_2x + dstStride; 1342cabdff1aSopenharmony_ci 1343cabdff1aSopenharmony_ci w_512 = __lasx_xvreplve0_w(w_512); 1344cabdff1aSopenharmony_ci 1345cabdff1aSopenharmony_ci src -= srcStride << 1; 1346cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp0) 1347cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp2) 1348cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp4) 1349cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp6) 1350cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp8) 1351cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp10) 1352cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_H(tmp12) 1353cabdff1aSopenharmony_ci tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21); 1354cabdff1aSopenharmony_ci tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21); 1355cabdff1aSopenharmony_ci tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21); 1356cabdff1aSopenharmony_ci tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21); 1357cabdff1aSopenharmony_ci tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21); 1358cabdff1aSopenharmony_ci tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21); 1359cabdff1aSopenharmony_ci 1360cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01, 1361cabdff1aSopenharmony_ci src02, src03, src04, src05, tmp0) 1362cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01, 1363cabdff1aSopenharmony_ci src02, src03, src04, src05, tmp2) 1364cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01, 1365cabdff1aSopenharmony_ci src02, src03, src04, src05, tmp4) 1366cabdff1aSopenharmony_ci QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01, 1367cabdff1aSopenharmony_ci src02, src03, src04, src05, tmp6) 1368cabdff1aSopenharmony_ci 1369cabdff1aSopenharmony_ci src00 = __lasx_xvld(dst, 0); 1370cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst, 1371cabdff1aSopenharmony_ci dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04); 1372cabdff1aSopenharmony_ci dst += dstStride_4x; 1373cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, tmp8); 1374cabdff1aSopenharmony_ci tmp9 = __lasx_xvldx(dst, dstStride_3x); 1375cabdff1aSopenharmony_ci dst -= dstStride_4x; 1376cabdff1aSopenharmony_ci tmp1 = __lasx_xvpermi_q(src00, src01, 0x02); 1377cabdff1aSopenharmony_ci tmp3 = __lasx_xvpermi_q(src02, src03, 0x02); 1378cabdff1aSopenharmony_ci tmp5 = __lasx_xvpermi_q(src04, src05, 0x02); 1379cabdff1aSopenharmony_ci tmp7 = __lasx_xvpermi_q(tmp8, tmp9, 0x02); 1380cabdff1aSopenharmony_ci tmp0 = __lasx_xvavgr_bu(tmp0, tmp1); 1381cabdff1aSopenharmony_ci tmp2 = __lasx_xvavgr_bu(tmp2, tmp3); 1382cabdff1aSopenharmony_ci tmp4 = __lasx_xvavgr_bu(tmp4, tmp5); 1383cabdff1aSopenharmony_ci tmp6 = __lasx_xvavgr_bu(tmp6, tmp7); 1384cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst, 0, 0); 1385cabdff1aSopenharmony_ci dst += dstStride; 1386cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp0, dst, 0, 2); 1387cabdff1aSopenharmony_ci dst += dstStride; 1388cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp2, dst, 0, 0); 1389cabdff1aSopenharmony_ci dst += dstStride; 1390cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp2, dst, 0, 2); 1391cabdff1aSopenharmony_ci dst += dstStride; 1392cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp4, dst, 0, 0); 1393cabdff1aSopenharmony_ci dst += dstStride; 1394cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp4, dst, 0, 2); 1395cabdff1aSopenharmony_ci dst += dstStride; 1396cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp6, dst, 0, 0); 1397cabdff1aSopenharmony_ci dst += dstStride; 1398cabdff1aSopenharmony_ci __lasx_xvstelm_d(tmp6, dst, 0, 2); 1399cabdff1aSopenharmony_ci} 1400cabdff1aSopenharmony_ci 1401cabdff1aSopenharmony_cistatic av_always_inline void 1402cabdff1aSopenharmony_ciput_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1403cabdff1aSopenharmony_ci int dstStride, int srcStride) 1404cabdff1aSopenharmony_ci{ 1405cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride); 1406cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride); 1407cabdff1aSopenharmony_ci src += srcStride << 3; 1408cabdff1aSopenharmony_ci dst += dstStride << 3; 1409cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride); 1410cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride); 1411cabdff1aSopenharmony_ci} 1412cabdff1aSopenharmony_ci 1413cabdff1aSopenharmony_cistatic av_always_inline void 1414cabdff1aSopenharmony_ciavg_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1415cabdff1aSopenharmony_ci int dstStride, int srcStride) 1416cabdff1aSopenharmony_ci{ 1417cabdff1aSopenharmony_ci avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride); 1418cabdff1aSopenharmony_ci avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride); 1419cabdff1aSopenharmony_ci src += srcStride << 3; 1420cabdff1aSopenharmony_ci dst += dstStride << 3; 1421cabdff1aSopenharmony_ci avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride); 1422cabdff1aSopenharmony_ci avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride); 1423cabdff1aSopenharmony_ci} 1424cabdff1aSopenharmony_ci 1425cabdff1aSopenharmony_cistatic void put_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1426cabdff1aSopenharmony_ci int dstStride, int srcStride) 1427cabdff1aSopenharmony_ci{ 1428cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride); 1429cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride); 1430cabdff1aSopenharmony_ci src += 8*srcStride; 1431cabdff1aSopenharmony_ci dst += 8*dstStride; 1432cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride); 1433cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride); 1434cabdff1aSopenharmony_ci} 1435cabdff1aSopenharmony_ci 1436cabdff1aSopenharmony_cistatic void avg_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1437cabdff1aSopenharmony_ci int dstStride, int srcStride) 1438cabdff1aSopenharmony_ci{ 1439cabdff1aSopenharmony_ci avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride); 1440cabdff1aSopenharmony_ci avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride); 1441cabdff1aSopenharmony_ci src += 8*srcStride; 1442cabdff1aSopenharmony_ci dst += 8*dstStride; 1443cabdff1aSopenharmony_ci avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride); 1444cabdff1aSopenharmony_ci avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride); 1445cabdff1aSopenharmony_ci} 1446cabdff1aSopenharmony_ci 1447cabdff1aSopenharmony_cistatic void put_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1448cabdff1aSopenharmony_ci ptrdiff_t dstStride, ptrdiff_t srcStride) 1449cabdff1aSopenharmony_ci{ 1450cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride); 1451cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride); 1452cabdff1aSopenharmony_ci src += srcStride << 3; 1453cabdff1aSopenharmony_ci dst += dstStride << 3; 1454cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride); 1455cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride); 1456cabdff1aSopenharmony_ci} 1457cabdff1aSopenharmony_ci 1458cabdff1aSopenharmony_cistatic void avg_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src, 1459cabdff1aSopenharmony_ci ptrdiff_t dstStride, ptrdiff_t srcStride) 1460cabdff1aSopenharmony_ci{ 1461cabdff1aSopenharmony_ci avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride); 1462cabdff1aSopenharmony_ci avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride); 1463cabdff1aSopenharmony_ci src += srcStride << 3; 1464cabdff1aSopenharmony_ci dst += dstStride << 3; 1465cabdff1aSopenharmony_ci avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride); 1466cabdff1aSopenharmony_ci avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride); 1467cabdff1aSopenharmony_ci} 1468cabdff1aSopenharmony_ci 1469cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src, 1470cabdff1aSopenharmony_ci ptrdiff_t stride) 1471cabdff1aSopenharmony_ci{ 1472cabdff1aSopenharmony_ci /* In mmi optimization, it used function ff_put_pixels8_8_mmi 1473cabdff1aSopenharmony_ci * which implemented in hpeldsp_mmi.c */ 1474cabdff1aSopenharmony_ci put_pixels8_8_inline_asm(dst, src, stride); 1475cabdff1aSopenharmony_ci} 1476cabdff1aSopenharmony_ci 1477cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src, 1478cabdff1aSopenharmony_ci ptrdiff_t stride) 1479cabdff1aSopenharmony_ci{ 1480cabdff1aSopenharmony_ci uint8_t half[64]; 1481cabdff1aSopenharmony_ci 1482cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride); 1483cabdff1aSopenharmony_ci /* in qpel8, the stride of half and height of block is 8 */ 1484cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, src, half, stride, stride); 1485cabdff1aSopenharmony_ci} 1486cabdff1aSopenharmony_ci 1487cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src, 1488cabdff1aSopenharmony_ci ptrdiff_t stride) 1489cabdff1aSopenharmony_ci{ 1490cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride); 1491cabdff1aSopenharmony_ci} 1492cabdff1aSopenharmony_ci 1493cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src, 1494cabdff1aSopenharmony_ci ptrdiff_t stride) 1495cabdff1aSopenharmony_ci{ 1496cabdff1aSopenharmony_ci uint8_t half[64]; 1497cabdff1aSopenharmony_ci 1498cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride); 1499cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride); 1500cabdff1aSopenharmony_ci} 1501cabdff1aSopenharmony_ci 1502cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src, 1503cabdff1aSopenharmony_ci ptrdiff_t stride) 1504cabdff1aSopenharmony_ci{ 1505cabdff1aSopenharmony_ci uint8_t half[64]; 1506cabdff1aSopenharmony_ci 1507cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride); 1508cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, src, half, stride, stride); 1509cabdff1aSopenharmony_ci} 1510cabdff1aSopenharmony_ci 1511cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src, 1512cabdff1aSopenharmony_ci ptrdiff_t stride) 1513cabdff1aSopenharmony_ci{ 1514cabdff1aSopenharmony_ci uint8_t halfH[64]; 1515cabdff1aSopenharmony_ci uint8_t halfV[64]; 1516cabdff1aSopenharmony_ci 1517cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1518cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride); 1519cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1520cabdff1aSopenharmony_ci} 1521cabdff1aSopenharmony_ci 1522cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src, 1523cabdff1aSopenharmony_ci ptrdiff_t stride) 1524cabdff1aSopenharmony_ci{ 1525cabdff1aSopenharmony_ci uint8_t temp[128]; 1526cabdff1aSopenharmony_ci uint8_t *const halfH = temp; 1527cabdff1aSopenharmony_ci uint8_t *const halfHV = temp + 64; 1528cabdff1aSopenharmony_ci 1529cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1530cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1531cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1532cabdff1aSopenharmony_ci} 1533cabdff1aSopenharmony_ci 1534cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src, 1535cabdff1aSopenharmony_ci ptrdiff_t stride) 1536cabdff1aSopenharmony_ci{ 1537cabdff1aSopenharmony_ci uint8_t halfH[64]; 1538cabdff1aSopenharmony_ci uint8_t halfV[64]; 1539cabdff1aSopenharmony_ci 1540cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1541cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride); 1542cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1543cabdff1aSopenharmony_ci} 1544cabdff1aSopenharmony_ci 1545cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src, 1546cabdff1aSopenharmony_ci ptrdiff_t stride) 1547cabdff1aSopenharmony_ci{ 1548cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride); 1549cabdff1aSopenharmony_ci} 1550cabdff1aSopenharmony_ci 1551cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src, 1552cabdff1aSopenharmony_ci ptrdiff_t stride) 1553cabdff1aSopenharmony_ci{ 1554cabdff1aSopenharmony_ci uint8_t temp[128]; 1555cabdff1aSopenharmony_ci uint8_t *const halfHV = temp; 1556cabdff1aSopenharmony_ci uint8_t *const halfH = temp + 64; 1557cabdff1aSopenharmony_ci 1558cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1559cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride); 1560cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1561cabdff1aSopenharmony_ci} 1562cabdff1aSopenharmony_ci 1563cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src, 1564cabdff1aSopenharmony_ci ptrdiff_t stride) 1565cabdff1aSopenharmony_ci{ 1566cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride); 1567cabdff1aSopenharmony_ci} 1568cabdff1aSopenharmony_ci 1569cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src, 1570cabdff1aSopenharmony_ci ptrdiff_t stride) 1571cabdff1aSopenharmony_ci{ 1572cabdff1aSopenharmony_ci uint8_t temp[128]; 1573cabdff1aSopenharmony_ci uint8_t *const halfHV = temp; 1574cabdff1aSopenharmony_ci uint8_t *const halfH = temp + 64; 1575cabdff1aSopenharmony_ci 1576cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1577cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride); 1578cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1579cabdff1aSopenharmony_ci} 1580cabdff1aSopenharmony_ci 1581cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src, 1582cabdff1aSopenharmony_ci ptrdiff_t stride) 1583cabdff1aSopenharmony_ci{ 1584cabdff1aSopenharmony_ci uint8_t half[64]; 1585cabdff1aSopenharmony_ci 1586cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride); 1587cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride); 1588cabdff1aSopenharmony_ci} 1589cabdff1aSopenharmony_ci 1590cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src, 1591cabdff1aSopenharmony_ci ptrdiff_t stride) 1592cabdff1aSopenharmony_ci{ 1593cabdff1aSopenharmony_ci uint8_t halfH[64]; 1594cabdff1aSopenharmony_ci uint8_t halfV[64]; 1595cabdff1aSopenharmony_ci 1596cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1597cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride); 1598cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1599cabdff1aSopenharmony_ci} 1600cabdff1aSopenharmony_ci 1601cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src, 1602cabdff1aSopenharmony_ci ptrdiff_t stride) 1603cabdff1aSopenharmony_ci{ 1604cabdff1aSopenharmony_ci uint8_t temp[128]; 1605cabdff1aSopenharmony_ci uint8_t *const halfH = temp; 1606cabdff1aSopenharmony_ci uint8_t *const halfHV = temp + 64; 1607cabdff1aSopenharmony_ci 1608cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1609cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1610cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1611cabdff1aSopenharmony_ci} 1612cabdff1aSopenharmony_ci 1613cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src, 1614cabdff1aSopenharmony_ci ptrdiff_t stride) 1615cabdff1aSopenharmony_ci{ 1616cabdff1aSopenharmony_ci uint8_t halfH[64]; 1617cabdff1aSopenharmony_ci uint8_t halfV[64]; 1618cabdff1aSopenharmony_ci 1619cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1620cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride); 1621cabdff1aSopenharmony_ci put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1622cabdff1aSopenharmony_ci} 1623cabdff1aSopenharmony_ci 1624cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src, 1625cabdff1aSopenharmony_ci ptrdiff_t stride) 1626cabdff1aSopenharmony_ci{ 1627cabdff1aSopenharmony_ci /* In mmi optimization, it used function ff_avg_pixels8_8_mmi 1628cabdff1aSopenharmony_ci * which implemented in hpeldsp_mmi.c */ 1629cabdff1aSopenharmony_ci avg_pixels8_8_lsx(dst, src, stride); 1630cabdff1aSopenharmony_ci} 1631cabdff1aSopenharmony_ci 1632cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src, 1633cabdff1aSopenharmony_ci ptrdiff_t stride) 1634cabdff1aSopenharmony_ci{ 1635cabdff1aSopenharmony_ci uint8_t half[64]; 1636cabdff1aSopenharmony_ci 1637cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride); 1638cabdff1aSopenharmony_ci avg_pixels8_l2_8_lsx(dst, src, half, stride, stride); 1639cabdff1aSopenharmony_ci} 1640cabdff1aSopenharmony_ci 1641cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src, 1642cabdff1aSopenharmony_ci ptrdiff_t stride) 1643cabdff1aSopenharmony_ci{ 1644cabdff1aSopenharmony_ci avg_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride); 1645cabdff1aSopenharmony_ci} 1646cabdff1aSopenharmony_ci 1647cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src, 1648cabdff1aSopenharmony_ci ptrdiff_t stride) 1649cabdff1aSopenharmony_ci{ 1650cabdff1aSopenharmony_ci uint8_t half[64]; 1651cabdff1aSopenharmony_ci 1652cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride); 1653cabdff1aSopenharmony_ci avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride); 1654cabdff1aSopenharmony_ci} 1655cabdff1aSopenharmony_ci 1656cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src, 1657cabdff1aSopenharmony_ci ptrdiff_t stride) 1658cabdff1aSopenharmony_ci{ 1659cabdff1aSopenharmony_ci uint8_t halfH[64]; 1660cabdff1aSopenharmony_ci uint8_t halfV[64]; 1661cabdff1aSopenharmony_ci 1662cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1663cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride); 1664cabdff1aSopenharmony_ci avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1665cabdff1aSopenharmony_ci} 1666cabdff1aSopenharmony_ci 1667cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src, 1668cabdff1aSopenharmony_ci ptrdiff_t stride) 1669cabdff1aSopenharmony_ci{ 1670cabdff1aSopenharmony_ci uint8_t temp[128]; 1671cabdff1aSopenharmony_ci uint8_t *const halfH = temp; 1672cabdff1aSopenharmony_ci uint8_t *const halfHV = temp + 64; 1673cabdff1aSopenharmony_ci 1674cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1675cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1676cabdff1aSopenharmony_ci avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1677cabdff1aSopenharmony_ci} 1678cabdff1aSopenharmony_ci 1679cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src, 1680cabdff1aSopenharmony_ci ptrdiff_t stride) 1681cabdff1aSopenharmony_ci{ 1682cabdff1aSopenharmony_ci uint8_t halfH[64]; 1683cabdff1aSopenharmony_ci uint8_t halfV[64]; 1684cabdff1aSopenharmony_ci 1685cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride); 1686cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride); 1687cabdff1aSopenharmony_ci avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1688cabdff1aSopenharmony_ci} 1689cabdff1aSopenharmony_ci 1690cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src, 1691cabdff1aSopenharmony_ci ptrdiff_t stride) 1692cabdff1aSopenharmony_ci{ 1693cabdff1aSopenharmony_ci avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride); 1694cabdff1aSopenharmony_ci} 1695cabdff1aSopenharmony_ci 1696cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src, 1697cabdff1aSopenharmony_ci ptrdiff_t stride) 1698cabdff1aSopenharmony_ci{ 1699cabdff1aSopenharmony_ci uint8_t temp[128]; 1700cabdff1aSopenharmony_ci uint8_t *const halfHV = temp; 1701cabdff1aSopenharmony_ci uint8_t *const halfH = temp + 64; 1702cabdff1aSopenharmony_ci 1703cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1704cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride); 1705cabdff1aSopenharmony_ci avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1706cabdff1aSopenharmony_ci} 1707cabdff1aSopenharmony_ci 1708cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src, 1709cabdff1aSopenharmony_ci ptrdiff_t stride) 1710cabdff1aSopenharmony_ci{ 1711cabdff1aSopenharmony_ci avg_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride); 1712cabdff1aSopenharmony_ci} 1713cabdff1aSopenharmony_ci 1714cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src, 1715cabdff1aSopenharmony_ci ptrdiff_t stride) 1716cabdff1aSopenharmony_ci{ 1717cabdff1aSopenharmony_ci uint8_t temp[128]; 1718cabdff1aSopenharmony_ci uint8_t *const halfHV = temp; 1719cabdff1aSopenharmony_ci uint8_t *const halfH = temp + 64; 1720cabdff1aSopenharmony_ci 1721cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1722cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride); 1723cabdff1aSopenharmony_ci avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1724cabdff1aSopenharmony_ci} 1725cabdff1aSopenharmony_ci 1726cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src, 1727cabdff1aSopenharmony_ci ptrdiff_t stride) 1728cabdff1aSopenharmony_ci{ 1729cabdff1aSopenharmony_ci uint8_t halfH[64]; 1730cabdff1aSopenharmony_ci uint8_t halfV[64]; 1731cabdff1aSopenharmony_ci 1732cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1733cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride); 1734cabdff1aSopenharmony_ci avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1735cabdff1aSopenharmony_ci} 1736cabdff1aSopenharmony_ci 1737cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src, 1738cabdff1aSopenharmony_ci ptrdiff_t stride) 1739cabdff1aSopenharmony_ci{ 1740cabdff1aSopenharmony_ci uint8_t temp[128]; 1741cabdff1aSopenharmony_ci uint8_t *const halfH = temp; 1742cabdff1aSopenharmony_ci uint8_t *const halfHV = temp + 64; 1743cabdff1aSopenharmony_ci 1744cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1745cabdff1aSopenharmony_ci put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride); 1746cabdff1aSopenharmony_ci avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); 1747cabdff1aSopenharmony_ci} 1748cabdff1aSopenharmony_ci 1749cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src, 1750cabdff1aSopenharmony_ci ptrdiff_t stride) 1751cabdff1aSopenharmony_ci{ 1752cabdff1aSopenharmony_ci uint8_t halfH[64]; 1753cabdff1aSopenharmony_ci uint8_t halfV[64]; 1754cabdff1aSopenharmony_ci 1755cabdff1aSopenharmony_ci put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride); 1756cabdff1aSopenharmony_ci put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride); 1757cabdff1aSopenharmony_ci avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); 1758cabdff1aSopenharmony_ci} 1759cabdff1aSopenharmony_ci 1760cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src, 1761cabdff1aSopenharmony_ci ptrdiff_t stride) 1762cabdff1aSopenharmony_ci{ 1763cabdff1aSopenharmony_ci /* In mmi optimization, it used function ff_put_pixels16_8_mmi 1764cabdff1aSopenharmony_ci * which implemented in hpeldsp_mmi.c */ 1765cabdff1aSopenharmony_ci put_pixels16_8_lsx(dst, src, stride); 1766cabdff1aSopenharmony_ci} 1767cabdff1aSopenharmony_ci 1768cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src, 1769cabdff1aSopenharmony_ci ptrdiff_t stride) 1770cabdff1aSopenharmony_ci{ 1771cabdff1aSopenharmony_ci uint8_t half[256]; 1772cabdff1aSopenharmony_ci 1773cabdff1aSopenharmony_ci put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride); 1774cabdff1aSopenharmony_ci put_pixels16_l2_8_lsx(dst, src, half, stride, stride); 1775cabdff1aSopenharmony_ci} 1776cabdff1aSopenharmony_ci 1777cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src, 1778cabdff1aSopenharmony_ci ptrdiff_t stride) 1779cabdff1aSopenharmony_ci{ 1780cabdff1aSopenharmony_ci put_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride); 1781cabdff1aSopenharmony_ci} 1782cabdff1aSopenharmony_ci 1783cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src, 1784cabdff1aSopenharmony_ci ptrdiff_t stride) 1785cabdff1aSopenharmony_ci{ 1786cabdff1aSopenharmony_ci uint8_t half[256]; 1787cabdff1aSopenharmony_ci 1788cabdff1aSopenharmony_ci put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride); 1789cabdff1aSopenharmony_ci put_pixels16_l2_8_lsx(dst, src+1, half, stride, stride); 1790cabdff1aSopenharmony_ci} 1791cabdff1aSopenharmony_ci 1792cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src, 1793cabdff1aSopenharmony_ci ptrdiff_t stride) 1794cabdff1aSopenharmony_ci{ 1795cabdff1aSopenharmony_ci uint8_t half[256]; 1796cabdff1aSopenharmony_ci 1797cabdff1aSopenharmony_ci put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride); 1798cabdff1aSopenharmony_ci put_pixels16_l2_8_lsx(dst, src, half, stride, stride); 1799cabdff1aSopenharmony_ci} 1800cabdff1aSopenharmony_ci 1801cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src, 1802cabdff1aSopenharmony_ci ptrdiff_t stride) 1803cabdff1aSopenharmony_ci{ 1804cabdff1aSopenharmony_ci avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2), 1805cabdff1aSopenharmony_ci dst, stride); 1806cabdff1aSopenharmony_ci} 1807cabdff1aSopenharmony_ci 1808cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src, 1809cabdff1aSopenharmony_ci ptrdiff_t stride) 1810cabdff1aSopenharmony_ci{ 1811cabdff1aSopenharmony_ci uint8_t temp[512]; 1812cabdff1aSopenharmony_ci uint8_t *const halfH = temp; 1813cabdff1aSopenharmony_ci uint8_t *const halfHV = temp + 256; 1814cabdff1aSopenharmony_ci 1815cabdff1aSopenharmony_ci put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride); 1816cabdff1aSopenharmony_ci put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1817cabdff1aSopenharmony_ci put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1818cabdff1aSopenharmony_ci} 1819cabdff1aSopenharmony_ci 1820cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src, 1821cabdff1aSopenharmony_ci ptrdiff_t stride) 1822cabdff1aSopenharmony_ci{ 1823cabdff1aSopenharmony_ci avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2) + 1, 1824cabdff1aSopenharmony_ci dst, stride); 1825cabdff1aSopenharmony_ci} 1826cabdff1aSopenharmony_ci 1827cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src, 1828cabdff1aSopenharmony_ci ptrdiff_t stride) 1829cabdff1aSopenharmony_ci{ 1830cabdff1aSopenharmony_ci put_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride); 1831cabdff1aSopenharmony_ci} 1832cabdff1aSopenharmony_ci 1833cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src, 1834cabdff1aSopenharmony_ci ptrdiff_t stride) 1835cabdff1aSopenharmony_ci{ 1836cabdff1aSopenharmony_ci uint8_t temp[512]; 1837cabdff1aSopenharmony_ci uint8_t *const halfHV = temp; 1838cabdff1aSopenharmony_ci uint8_t *const halfH = temp + 256; 1839cabdff1aSopenharmony_ci 1840cabdff1aSopenharmony_ci put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1841cabdff1aSopenharmony_ci put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride); 1842cabdff1aSopenharmony_ci put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1843cabdff1aSopenharmony_ci} 1844cabdff1aSopenharmony_ci 1845cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src, 1846cabdff1aSopenharmony_ci ptrdiff_t stride) 1847cabdff1aSopenharmony_ci{ 1848cabdff1aSopenharmony_ci put_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride); 1849cabdff1aSopenharmony_ci} 1850cabdff1aSopenharmony_ci 1851cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src, 1852cabdff1aSopenharmony_ci ptrdiff_t stride) 1853cabdff1aSopenharmony_ci{ 1854cabdff1aSopenharmony_ci uint8_t temp[512]; 1855cabdff1aSopenharmony_ci uint8_t *const halfHV = temp; 1856cabdff1aSopenharmony_ci uint8_t *const halfH = temp + 256; 1857cabdff1aSopenharmony_ci 1858cabdff1aSopenharmony_ci put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1859cabdff1aSopenharmony_ci put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride); 1860cabdff1aSopenharmony_ci put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1861cabdff1aSopenharmony_ci} 1862cabdff1aSopenharmony_ci 1863cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src, 1864cabdff1aSopenharmony_ci ptrdiff_t stride) 1865cabdff1aSopenharmony_ci{ 1866cabdff1aSopenharmony_ci uint8_t half[256]; 1867cabdff1aSopenharmony_ci 1868cabdff1aSopenharmony_ci put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride); 1869cabdff1aSopenharmony_ci put_pixels16_l2_8_lsx(dst, src+stride, half, stride, stride); 1870cabdff1aSopenharmony_ci} 1871cabdff1aSopenharmony_ci 1872cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src, 1873cabdff1aSopenharmony_ci ptrdiff_t stride) 1874cabdff1aSopenharmony_ci{ 1875cabdff1aSopenharmony_ci avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2, (uint8_t*)src - (stride * 2), 1876cabdff1aSopenharmony_ci dst, stride); 1877cabdff1aSopenharmony_ci} 1878cabdff1aSopenharmony_ci 1879cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src, 1880cabdff1aSopenharmony_ci ptrdiff_t stride) 1881cabdff1aSopenharmony_ci{ 1882cabdff1aSopenharmony_ci uint8_t temp[512]; 1883cabdff1aSopenharmony_ci uint8_t *const halfH = temp; 1884cabdff1aSopenharmony_ci uint8_t *const halfHV = temp + 256; 1885cabdff1aSopenharmony_ci 1886cabdff1aSopenharmony_ci put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride); 1887cabdff1aSopenharmony_ci put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1888cabdff1aSopenharmony_ci put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1889cabdff1aSopenharmony_ci} 1890cabdff1aSopenharmony_ci 1891cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src, 1892cabdff1aSopenharmony_ci ptrdiff_t stride) 1893cabdff1aSopenharmony_ci{ 1894cabdff1aSopenharmony_ci avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2, 1895cabdff1aSopenharmony_ci (uint8_t*)src - (stride * 2) + 1, dst, stride); 1896cabdff1aSopenharmony_ci} 1897cabdff1aSopenharmony_ci 1898cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src, 1899cabdff1aSopenharmony_ci ptrdiff_t stride) 1900cabdff1aSopenharmony_ci{ 1901cabdff1aSopenharmony_ci /* In mmi optimization, it used function ff_avg_pixels16_8_mmi 1902cabdff1aSopenharmony_ci * which implemented in hpeldsp_mmi.c */ 1903cabdff1aSopenharmony_ci avg_pixels16_8_lsx(dst, src, stride); 1904cabdff1aSopenharmony_ci} 1905cabdff1aSopenharmony_ci 1906cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src, 1907cabdff1aSopenharmony_ci ptrdiff_t stride) 1908cabdff1aSopenharmony_ci{ 1909cabdff1aSopenharmony_ci uint8_t half[256]; 1910cabdff1aSopenharmony_ci 1911cabdff1aSopenharmony_ci put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride); 1912cabdff1aSopenharmony_ci avg_pixels16_l2_8_lsx(dst, src, half, stride, stride); 1913cabdff1aSopenharmony_ci} 1914cabdff1aSopenharmony_ci 1915cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src, 1916cabdff1aSopenharmony_ci ptrdiff_t stride) 1917cabdff1aSopenharmony_ci{ 1918cabdff1aSopenharmony_ci avg_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride); 1919cabdff1aSopenharmony_ci} 1920cabdff1aSopenharmony_ci 1921cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src, 1922cabdff1aSopenharmony_ci ptrdiff_t stride) 1923cabdff1aSopenharmony_ci{ 1924cabdff1aSopenharmony_ci uint8_t half[256]; 1925cabdff1aSopenharmony_ci 1926cabdff1aSopenharmony_ci put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride); 1927cabdff1aSopenharmony_ci avg_pixels16_l2_8_lsx(dst, src+1, half, stride, stride); 1928cabdff1aSopenharmony_ci} 1929cabdff1aSopenharmony_ci 1930cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src, 1931cabdff1aSopenharmony_ci ptrdiff_t stride) 1932cabdff1aSopenharmony_ci{ 1933cabdff1aSopenharmony_ci uint8_t half[256]; 1934cabdff1aSopenharmony_ci 1935cabdff1aSopenharmony_ci put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride); 1936cabdff1aSopenharmony_ci avg_pixels16_l2_8_lsx(dst, src, half, stride, stride); 1937cabdff1aSopenharmony_ci} 1938cabdff1aSopenharmony_ci 1939cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src, 1940cabdff1aSopenharmony_ci ptrdiff_t stride) 1941cabdff1aSopenharmony_ci{ 1942cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2, 1943cabdff1aSopenharmony_ci (uint8_t*)src - (stride * 2), 1944cabdff1aSopenharmony_ci dst, stride); 1945cabdff1aSopenharmony_ci} 1946cabdff1aSopenharmony_ci 1947cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src, 1948cabdff1aSopenharmony_ci ptrdiff_t stride) 1949cabdff1aSopenharmony_ci{ 1950cabdff1aSopenharmony_ci uint8_t temp[512]; 1951cabdff1aSopenharmony_ci uint8_t *const halfH = temp; 1952cabdff1aSopenharmony_ci uint8_t *const halfHV = temp + 256; 1953cabdff1aSopenharmony_ci 1954cabdff1aSopenharmony_ci put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride); 1955cabdff1aSopenharmony_ci put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1956cabdff1aSopenharmony_ci avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1957cabdff1aSopenharmony_ci} 1958cabdff1aSopenharmony_ci 1959cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src, 1960cabdff1aSopenharmony_ci ptrdiff_t stride) 1961cabdff1aSopenharmony_ci{ 1962cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2, 1963cabdff1aSopenharmony_ci (uint8_t*)src - (stride * 2) + 1, 1964cabdff1aSopenharmony_ci dst, stride); 1965cabdff1aSopenharmony_ci} 1966cabdff1aSopenharmony_ci 1967cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src, 1968cabdff1aSopenharmony_ci ptrdiff_t stride) 1969cabdff1aSopenharmony_ci{ 1970cabdff1aSopenharmony_ci avg_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride); 1971cabdff1aSopenharmony_ci} 1972cabdff1aSopenharmony_ci 1973cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src, 1974cabdff1aSopenharmony_ci ptrdiff_t stride) 1975cabdff1aSopenharmony_ci{ 1976cabdff1aSopenharmony_ci uint8_t temp[512]; 1977cabdff1aSopenharmony_ci uint8_t *const halfHV = temp; 1978cabdff1aSopenharmony_ci uint8_t *const halfH = temp + 256; 1979cabdff1aSopenharmony_ci 1980cabdff1aSopenharmony_ci put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1981cabdff1aSopenharmony_ci put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride); 1982cabdff1aSopenharmony_ci avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 1983cabdff1aSopenharmony_ci} 1984cabdff1aSopenharmony_ci 1985cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src, 1986cabdff1aSopenharmony_ci ptrdiff_t stride) 1987cabdff1aSopenharmony_ci{ 1988cabdff1aSopenharmony_ci avg_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride); 1989cabdff1aSopenharmony_ci} 1990cabdff1aSopenharmony_ci 1991cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src, 1992cabdff1aSopenharmony_ci ptrdiff_t stride) 1993cabdff1aSopenharmony_ci{ 1994cabdff1aSopenharmony_ci uint8_t temp[512]; 1995cabdff1aSopenharmony_ci uint8_t *const halfHV = temp; 1996cabdff1aSopenharmony_ci uint8_t *const halfH = temp + 256; 1997cabdff1aSopenharmony_ci 1998cabdff1aSopenharmony_ci put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 1999cabdff1aSopenharmony_ci put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride); 2000cabdff1aSopenharmony_ci avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 2001cabdff1aSopenharmony_ci} 2002cabdff1aSopenharmony_ci 2003cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src, 2004cabdff1aSopenharmony_ci ptrdiff_t stride) 2005cabdff1aSopenharmony_ci{ 2006cabdff1aSopenharmony_ci uint8_t half[256]; 2007cabdff1aSopenharmony_ci 2008cabdff1aSopenharmony_ci put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride); 2009cabdff1aSopenharmony_ci avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride); 2010cabdff1aSopenharmony_ci} 2011cabdff1aSopenharmony_ci 2012cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src, 2013cabdff1aSopenharmony_ci ptrdiff_t stride) 2014cabdff1aSopenharmony_ci{ 2015cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2, 2016cabdff1aSopenharmony_ci (uint8_t*)src - (stride * 2), 2017cabdff1aSopenharmony_ci dst, stride); 2018cabdff1aSopenharmony_ci} 2019cabdff1aSopenharmony_ci 2020cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src, 2021cabdff1aSopenharmony_ci ptrdiff_t stride) 2022cabdff1aSopenharmony_ci{ 2023cabdff1aSopenharmony_ci uint8_t temp[512]; 2024cabdff1aSopenharmony_ci uint8_t *const halfH = temp; 2025cabdff1aSopenharmony_ci uint8_t *const halfHV = temp + 256; 2026cabdff1aSopenharmony_ci 2027cabdff1aSopenharmony_ci put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride); 2028cabdff1aSopenharmony_ci put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride); 2029cabdff1aSopenharmony_ci avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); 2030cabdff1aSopenharmony_ci} 2031cabdff1aSopenharmony_ci 2032cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src, 2033cabdff1aSopenharmony_ci ptrdiff_t stride) 2034cabdff1aSopenharmony_ci{ 2035cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2, 2036cabdff1aSopenharmony_ci (uint8_t*)src - (stride * 2) + 1, 2037cabdff1aSopenharmony_ci dst, stride); 2038cabdff1aSopenharmony_ci} 2039