1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavcodec/vp9dsp.h" 22cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 23cabdff1aSopenharmony_ci#include "vp9dsp_mips.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_ci#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ 26cabdff1aSopenharmony_ci p1_out, p0_out, q0_out, q1_out) \ 27cabdff1aSopenharmony_ci{ \ 28cabdff1aSopenharmony_ci v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt, filt1, filt2; \ 29cabdff1aSopenharmony_ci const v16i8 cnst4b = __msa_ldi_b(4); \ 30cabdff1aSopenharmony_ci const v16i8 cnst3b = __msa_ldi_b(3); \ 31cabdff1aSopenharmony_ci \ 32cabdff1aSopenharmony_ci p1_m = 
(v16i8) __msa_xori_b(p1_in, 0x80); \ 33cabdff1aSopenharmony_ci p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \ 34cabdff1aSopenharmony_ci q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \ 35cabdff1aSopenharmony_ci q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \ 36cabdff1aSopenharmony_ci \ 37cabdff1aSopenharmony_ci filt = __msa_subs_s_b(p1_m, q1_m); \ 38cabdff1aSopenharmony_ci \ 39cabdff1aSopenharmony_ci filt = filt & (v16i8) hev_in; \ 40cabdff1aSopenharmony_ci \ 41cabdff1aSopenharmony_ci q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ 42cabdff1aSopenharmony_ci filt = __msa_adds_s_b(filt, q0_sub_p0); \ 43cabdff1aSopenharmony_ci filt = __msa_adds_s_b(filt, q0_sub_p0); \ 44cabdff1aSopenharmony_ci filt = __msa_adds_s_b(filt, q0_sub_p0); \ 45cabdff1aSopenharmony_ci filt = filt & (v16i8) mask_in; \ 46cabdff1aSopenharmony_ci \ 47cabdff1aSopenharmony_ci filt1 = __msa_adds_s_b(filt, cnst4b); \ 48cabdff1aSopenharmony_ci filt1 >>= 3; \ 49cabdff1aSopenharmony_ci \ 50cabdff1aSopenharmony_ci filt2 = __msa_adds_s_b(filt, cnst3b); \ 51cabdff1aSopenharmony_ci filt2 >>= 3; \ 52cabdff1aSopenharmony_ci \ 53cabdff1aSopenharmony_ci q0_m = __msa_subs_s_b(q0_m, filt1); \ 54cabdff1aSopenharmony_ci q0_out = __msa_xori_b((v16u8) q0_m, 0x80); \ 55cabdff1aSopenharmony_ci p0_m = __msa_adds_s_b(p0_m, filt2); \ 56cabdff1aSopenharmony_ci p0_out = __msa_xori_b((v16u8) p0_m, 0x80); \ 57cabdff1aSopenharmony_ci \ 58cabdff1aSopenharmony_ci filt = __msa_srari_b(filt1, 1); \ 59cabdff1aSopenharmony_ci hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \ 60cabdff1aSopenharmony_ci filt = filt & (v16i8) hev_in; \ 61cabdff1aSopenharmony_ci \ 62cabdff1aSopenharmony_ci q1_m = __msa_subs_s_b(q1_m, filt); \ 63cabdff1aSopenharmony_ci q1_out = __msa_xori_b((v16u8) q1_m, 0x80); \ 64cabdff1aSopenharmony_ci p1_m = __msa_adds_s_b(p1_m, filt); \ 65cabdff1aSopenharmony_ci p1_out = __msa_xori_b((v16u8) p1_m, 0x80); \ 66cabdff1aSopenharmony_ci} 67cabdff1aSopenharmony_ci 68cabdff1aSopenharmony_ci#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, 
q3_in, flat_out) \ 69cabdff1aSopenharmony_ci{ \ 70cabdff1aSopenharmony_ci v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ 71cabdff1aSopenharmony_ci v16u8 zero_in = { 0 }; \ 72cabdff1aSopenharmony_ci \ 73cabdff1aSopenharmony_ci tmp = __msa_ori_b(zero_in, 1); \ 74cabdff1aSopenharmony_ci p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ 75cabdff1aSopenharmony_ci q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ 76cabdff1aSopenharmony_ci p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ 77cabdff1aSopenharmony_ci q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ 78cabdff1aSopenharmony_ci \ 79cabdff1aSopenharmony_ci p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ 80cabdff1aSopenharmony_ci flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ 81cabdff1aSopenharmony_ci p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ 82cabdff1aSopenharmony_ci flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ 83cabdff1aSopenharmony_ci \ 84cabdff1aSopenharmony_ci flat_out = (tmp < (v16u8) flat_out); \ 85cabdff1aSopenharmony_ci flat_out = __msa_xori_b(flat_out, 0xff); \ 86cabdff1aSopenharmony_ci flat_out = flat_out & (mask); \ 87cabdff1aSopenharmony_ci} 88cabdff1aSopenharmony_ci 89cabdff1aSopenharmony_ci#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \ 90cabdff1aSopenharmony_ci q5_in, q6_in, q7_in, flat_in, flat2_out) \ 91cabdff1aSopenharmony_ci{ \ 92cabdff1aSopenharmony_ci v16u8 tmp, zero_in = { 0 }; \ 93cabdff1aSopenharmony_ci v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ 94cabdff1aSopenharmony_ci v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ 95cabdff1aSopenharmony_ci \ 96cabdff1aSopenharmony_ci tmp = __msa_ori_b(zero_in, 1); \ 97cabdff1aSopenharmony_ci p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ 98cabdff1aSopenharmony_ci q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ 99cabdff1aSopenharmony_ci p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ 100cabdff1aSopenharmony_ci q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ 
101cabdff1aSopenharmony_ci p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ 102cabdff1aSopenharmony_ci q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ 103cabdff1aSopenharmony_ci p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ 104cabdff1aSopenharmony_ci q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ 105cabdff1aSopenharmony_ci \ 106cabdff1aSopenharmony_ci p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ 107cabdff1aSopenharmony_ci flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ 108cabdff1aSopenharmony_ci flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ 109cabdff1aSopenharmony_ci p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ 110cabdff1aSopenharmony_ci flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ 111cabdff1aSopenharmony_ci p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ 112cabdff1aSopenharmony_ci flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ 113cabdff1aSopenharmony_ci \ 114cabdff1aSopenharmony_ci flat2_out = (tmp < (v16u8) flat2_out); \ 115cabdff1aSopenharmony_ci flat2_out = __msa_xori_b(flat2_out, 0xff); \ 116cabdff1aSopenharmony_ci flat2_out = flat2_out & flat_in; \ 117cabdff1aSopenharmony_ci} 118cabdff1aSopenharmony_ci 119cabdff1aSopenharmony_ci#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \ 120cabdff1aSopenharmony_ci q0_in, q1_in, q2_in, q3_in, \ 121cabdff1aSopenharmony_ci p2_filt8_out, p1_filt8_out, p0_filt8_out, \ 122cabdff1aSopenharmony_ci q0_filt8_out, q1_filt8_out, q2_filt8_out) \ 123cabdff1aSopenharmony_ci{ \ 124cabdff1aSopenharmony_ci v8u16 tmp0, tmp1, tmp2; \ 125cabdff1aSopenharmony_ci \ 126cabdff1aSopenharmony_ci tmp2 = p2_in + p1_in + p0_in; \ 127cabdff1aSopenharmony_ci tmp0 = p3_in << 1; \ 128cabdff1aSopenharmony_ci \ 129cabdff1aSopenharmony_ci tmp0 = tmp0 + tmp2 + q0_in; \ 130cabdff1aSopenharmony_ci tmp1 = tmp0 + p3_in + p2_in; \ 131cabdff1aSopenharmony_ci p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \ 132cabdff1aSopenharmony_ci \ 133cabdff1aSopenharmony_ci tmp1 = tmp0 + p1_in + q1_in; \ 
134cabdff1aSopenharmony_ci p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \ 135cabdff1aSopenharmony_ci \ 136cabdff1aSopenharmony_ci tmp1 = q2_in + q1_in + q0_in; \ 137cabdff1aSopenharmony_ci tmp2 = tmp2 + tmp1; \ 138cabdff1aSopenharmony_ci tmp0 = tmp2 + (p0_in); \ 139cabdff1aSopenharmony_ci tmp0 = tmp0 + (p3_in); \ 140cabdff1aSopenharmony_ci p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3); \ 141cabdff1aSopenharmony_ci \ 142cabdff1aSopenharmony_ci tmp0 = q2_in + q3_in; \ 143cabdff1aSopenharmony_ci tmp0 = p0_in + tmp1 + tmp0; \ 144cabdff1aSopenharmony_ci tmp1 = q3_in + q3_in; \ 145cabdff1aSopenharmony_ci tmp1 = tmp1 + tmp0; \ 146cabdff1aSopenharmony_ci q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \ 147cabdff1aSopenharmony_ci \ 148cabdff1aSopenharmony_ci tmp0 = tmp2 + q3_in; \ 149cabdff1aSopenharmony_ci tmp1 = tmp0 + q0_in; \ 150cabdff1aSopenharmony_ci q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \ 151cabdff1aSopenharmony_ci \ 152cabdff1aSopenharmony_ci tmp1 = tmp0 - p2_in; \ 153cabdff1aSopenharmony_ci tmp0 = q1_in + q3_in; \ 154cabdff1aSopenharmony_ci tmp1 = tmp0 + tmp1; \ 155cabdff1aSopenharmony_ci q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \ 156cabdff1aSopenharmony_ci} 157cabdff1aSopenharmony_ci 158cabdff1aSopenharmony_ci#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \ 159cabdff1aSopenharmony_ci q0_in, q1_in, q2_in, q3_in, \ 160cabdff1aSopenharmony_ci limit_in, b_limit_in, thresh_in, \ 161cabdff1aSopenharmony_ci hev_out, mask_out, flat_out) \ 162cabdff1aSopenharmony_ci{ \ 163cabdff1aSopenharmony_ci v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ 164cabdff1aSopenharmony_ci v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ 165cabdff1aSopenharmony_ci \ 166cabdff1aSopenharmony_ci /* absolute subtraction of pixel values */ \ 167cabdff1aSopenharmony_ci p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ 168cabdff1aSopenharmony_ci p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ 
169cabdff1aSopenharmony_ci p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ 170cabdff1aSopenharmony_ci q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ 171cabdff1aSopenharmony_ci q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ 172cabdff1aSopenharmony_ci q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ 173cabdff1aSopenharmony_ci p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ 174cabdff1aSopenharmony_ci p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ 175cabdff1aSopenharmony_ci \ 176cabdff1aSopenharmony_ci /* calculation of hev */ \ 177cabdff1aSopenharmony_ci flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ 178cabdff1aSopenharmony_ci hev_out = thresh_in < (v16u8) flat_out; \ 179cabdff1aSopenharmony_ci \ 180cabdff1aSopenharmony_ci /* calculation of mask */ \ 181cabdff1aSopenharmony_ci p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ 182cabdff1aSopenharmony_ci p1_asub_q1_m >>= 1; \ 183cabdff1aSopenharmony_ci p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ 184cabdff1aSopenharmony_ci \ 185cabdff1aSopenharmony_ci mask_out = b_limit_in < p0_asub_q0_m; \ 186cabdff1aSopenharmony_ci mask_out = __msa_max_u_b(flat_out, mask_out); \ 187cabdff1aSopenharmony_ci p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ 188cabdff1aSopenharmony_ci mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ 189cabdff1aSopenharmony_ci q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ 190cabdff1aSopenharmony_ci mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ 191cabdff1aSopenharmony_ci \ 192cabdff1aSopenharmony_ci mask_out = limit_in < (v16u8) mask_out; \ 193cabdff1aSopenharmony_ci mask_out = __msa_xori_b(mask_out, 0xff); \ 194cabdff1aSopenharmony_ci} 195cabdff1aSopenharmony_ci 196cabdff1aSopenharmony_civoid ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch, 197cabdff1aSopenharmony_ci int32_t b_limit_ptr, 198cabdff1aSopenharmony_ci int32_t limit_ptr, 199cabdff1aSopenharmony_ci int32_t thresh_ptr) 200cabdff1aSopenharmony_ci{ 201cabdff1aSopenharmony_ci 
uint64_t p1_d, p0_d, q0_d, q1_d; 202cabdff1aSopenharmony_ci v16u8 mask, hev, flat, thresh, b_limit, limit; 203cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; 204cabdff1aSopenharmony_ci 205cabdff1aSopenharmony_ci /* load vector elements */ 206cabdff1aSopenharmony_ci LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); 207cabdff1aSopenharmony_ci 208cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 209cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 210cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 211cabdff1aSopenharmony_ci 212cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 213cabdff1aSopenharmony_ci hev, mask, flat); 214cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 215cabdff1aSopenharmony_ci q1_out); 216cabdff1aSopenharmony_ci 217cabdff1aSopenharmony_ci p1_d = __msa_copy_u_d((v2i64) p1_out, 0); 218cabdff1aSopenharmony_ci p0_d = __msa_copy_u_d((v2i64) p0_out, 0); 219cabdff1aSopenharmony_ci q0_d = __msa_copy_u_d((v2i64) q0_out, 0); 220cabdff1aSopenharmony_ci q1_d = __msa_copy_u_d((v2i64) q1_out, 0); 221cabdff1aSopenharmony_ci SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); 222cabdff1aSopenharmony_ci} 223cabdff1aSopenharmony_ci 224cabdff1aSopenharmony_ci 225cabdff1aSopenharmony_civoid ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch, 226cabdff1aSopenharmony_ci int32_t b_limit_ptr, 227cabdff1aSopenharmony_ci int32_t limit_ptr, 228cabdff1aSopenharmony_ci int32_t thresh_ptr) 229cabdff1aSopenharmony_ci{ 230cabdff1aSopenharmony_ci v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; 231cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 232cabdff1aSopenharmony_ci 233cabdff1aSopenharmony_ci /* load vector elements */ 234cabdff1aSopenharmony_ci LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); 
235cabdff1aSopenharmony_ci 236cabdff1aSopenharmony_ci thresh0 = (v16u8) __msa_fill_b(thresh_ptr); 237cabdff1aSopenharmony_ci thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8); 238cabdff1aSopenharmony_ci thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0); 239cabdff1aSopenharmony_ci 240cabdff1aSopenharmony_ci b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr); 241cabdff1aSopenharmony_ci b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8); 242cabdff1aSopenharmony_ci b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0); 243cabdff1aSopenharmony_ci 244cabdff1aSopenharmony_ci limit0 = (v16u8) __msa_fill_b(limit_ptr); 245cabdff1aSopenharmony_ci limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8); 246cabdff1aSopenharmony_ci limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0); 247cabdff1aSopenharmony_ci 248cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, 249cabdff1aSopenharmony_ci hev, mask, flat); 250cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); 251cabdff1aSopenharmony_ci 252cabdff1aSopenharmony_ci ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); 253cabdff1aSopenharmony_ci} 254cabdff1aSopenharmony_ci 255cabdff1aSopenharmony_civoid ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch, 256cabdff1aSopenharmony_ci int32_t b_limit_ptr, 257cabdff1aSopenharmony_ci int32_t limit_ptr, 258cabdff1aSopenharmony_ci int32_t thresh_ptr) 259cabdff1aSopenharmony_ci{ 260cabdff1aSopenharmony_ci uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; 261cabdff1aSopenharmony_ci v16u8 mask, hev, flat, thresh, b_limit, limit; 262cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 263cabdff1aSopenharmony_ci v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 264cabdff1aSopenharmony_ci v8i16 p2_filter8, p1_filter8, p0_filter8; 265cabdff1aSopenharmony_ci v8i16 q0_filter8, q1_filter8, q2_filter8; 266cabdff1aSopenharmony_ci v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; 
267cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 268cabdff1aSopenharmony_ci 269cabdff1aSopenharmony_ci /* load vector elements */ 270cabdff1aSopenharmony_ci LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); 271cabdff1aSopenharmony_ci 272cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 273cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 274cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 275cabdff1aSopenharmony_ci 276cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 277cabdff1aSopenharmony_ci hev, mask, flat); 278cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 279cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 280cabdff1aSopenharmony_ci q1_out); 281cabdff1aSopenharmony_ci 282cabdff1aSopenharmony_ci flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); 283cabdff1aSopenharmony_ci 284cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 285cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 286cabdff1aSopenharmony_ci p1_d = __msa_copy_u_d((v2i64) p1_out, 0); 287cabdff1aSopenharmony_ci p0_d = __msa_copy_u_d((v2i64) p0_out, 0); 288cabdff1aSopenharmony_ci q0_d = __msa_copy_u_d((v2i64) q0_out, 0); 289cabdff1aSopenharmony_ci q1_d = __msa_copy_u_d((v2i64) q1_out, 0); 290cabdff1aSopenharmony_ci SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); 291cabdff1aSopenharmony_ci } else { 292cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, 293cabdff1aSopenharmony_ci zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, 294cabdff1aSopenharmony_ci q2_r, q3_r); 295cabdff1aSopenharmony_ci VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, 296cabdff1aSopenharmony_ci p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); 297cabdff1aSopenharmony_ci 298cabdff1aSopenharmony_ci /* convert 16 bit 
output data into 8 bit */ 299cabdff1aSopenharmony_ci PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, 300cabdff1aSopenharmony_ci zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, 301cabdff1aSopenharmony_ci q0_filter8); 302cabdff1aSopenharmony_ci PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); 303cabdff1aSopenharmony_ci 304cabdff1aSopenharmony_ci /* store pixel values */ 305cabdff1aSopenharmony_ci p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat); 306cabdff1aSopenharmony_ci p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat); 307cabdff1aSopenharmony_ci p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat); 308cabdff1aSopenharmony_ci q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat); 309cabdff1aSopenharmony_ci q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat); 310cabdff1aSopenharmony_ci q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat); 311cabdff1aSopenharmony_ci 312cabdff1aSopenharmony_ci p2_d = __msa_copy_u_d((v2i64) p2_out, 0); 313cabdff1aSopenharmony_ci p1_d = __msa_copy_u_d((v2i64) p1_out, 0); 314cabdff1aSopenharmony_ci p0_d = __msa_copy_u_d((v2i64) p0_out, 0); 315cabdff1aSopenharmony_ci q0_d = __msa_copy_u_d((v2i64) q0_out, 0); 316cabdff1aSopenharmony_ci q1_d = __msa_copy_u_d((v2i64) q1_out, 0); 317cabdff1aSopenharmony_ci q2_d = __msa_copy_u_d((v2i64) q2_out, 0); 318cabdff1aSopenharmony_ci 319cabdff1aSopenharmony_ci src -= 3 * pitch; 320cabdff1aSopenharmony_ci 321cabdff1aSopenharmony_ci SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); 322cabdff1aSopenharmony_ci src += (4 * pitch); 323cabdff1aSopenharmony_ci SD(q1_d, src); 324cabdff1aSopenharmony_ci src += pitch; 325cabdff1aSopenharmony_ci SD(q2_d, src); 326cabdff1aSopenharmony_ci } 327cabdff1aSopenharmony_ci} 328cabdff1aSopenharmony_ci 329cabdff1aSopenharmony_civoid ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch, 330cabdff1aSopenharmony_ci int32_t b_limit_ptr, 331cabdff1aSopenharmony_ci int32_t limit_ptr, 332cabdff1aSopenharmony_ci int32_t 
thresh_ptr) 333cabdff1aSopenharmony_ci{ 334cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 335cabdff1aSopenharmony_ci v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 336cabdff1aSopenharmony_ci v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; 337cabdff1aSopenharmony_ci v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 338cabdff1aSopenharmony_ci v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 339cabdff1aSopenharmony_ci v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; 340cabdff1aSopenharmony_ci v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; 341cabdff1aSopenharmony_ci v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; 342cabdff1aSopenharmony_ci v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; 343cabdff1aSopenharmony_ci v16u8 zero = { 0 }; 344cabdff1aSopenharmony_ci 345cabdff1aSopenharmony_ci /* load vector elements */ 346cabdff1aSopenharmony_ci LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); 347cabdff1aSopenharmony_ci 348cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 349cabdff1aSopenharmony_ci tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8); 350cabdff1aSopenharmony_ci thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh); 351cabdff1aSopenharmony_ci 352cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 353cabdff1aSopenharmony_ci tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8); 354cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit); 355cabdff1aSopenharmony_ci 356cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 357cabdff1aSopenharmony_ci tmp = (v16u8) __msa_fill_b(limit_ptr >> 8); 358cabdff1aSopenharmony_ci limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit); 359cabdff1aSopenharmony_ci 360cabdff1aSopenharmony_ci /* mask and hev */ 361cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 362cabdff1aSopenharmony_ci hev, mask, flat); 363cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 
364cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 365cabdff1aSopenharmony_ci q1_out); 366cabdff1aSopenharmony_ci 367cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 368cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 369cabdff1aSopenharmony_ci ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); 370cabdff1aSopenharmony_ci } else { 371cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, 372cabdff1aSopenharmony_ci zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, 373cabdff1aSopenharmony_ci q2_r, q3_r); 374cabdff1aSopenharmony_ci VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 375cabdff1aSopenharmony_ci p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); 376cabdff1aSopenharmony_ci 377cabdff1aSopenharmony_ci ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, 378cabdff1aSopenharmony_ci p0_l); 379cabdff1aSopenharmony_ci ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, 380cabdff1aSopenharmony_ci q3_l); 381cabdff1aSopenharmony_ci VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 382cabdff1aSopenharmony_ci p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 383cabdff1aSopenharmony_ci 384cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 385cabdff1aSopenharmony_ci PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, 386cabdff1aSopenharmony_ci p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, 387cabdff1aSopenharmony_ci p0_filt8_r, q0_filt8_r); 388cabdff1aSopenharmony_ci PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, 389cabdff1aSopenharmony_ci q1_filt8_r, q2_filt8_r); 390cabdff1aSopenharmony_ci 391cabdff1aSopenharmony_ci /* store pixel values */ 392cabdff1aSopenharmony_ci p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); 393cabdff1aSopenharmony_ci p1_out 
= __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); 394cabdff1aSopenharmony_ci p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); 395cabdff1aSopenharmony_ci q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); 396cabdff1aSopenharmony_ci q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); 397cabdff1aSopenharmony_ci q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); 398cabdff1aSopenharmony_ci 399cabdff1aSopenharmony_ci src -= 3 * pitch; 400cabdff1aSopenharmony_ci 401cabdff1aSopenharmony_ci ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); 402cabdff1aSopenharmony_ci src += (4 * pitch); 403cabdff1aSopenharmony_ci ST_UB2(q1_out, q2_out, src, pitch); 404cabdff1aSopenharmony_ci src += (2 * pitch); 405cabdff1aSopenharmony_ci } 406cabdff1aSopenharmony_ci} 407cabdff1aSopenharmony_ci 408cabdff1aSopenharmony_civoid ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch, 409cabdff1aSopenharmony_ci int32_t b_limit_ptr, 410cabdff1aSopenharmony_ci int32_t limit_ptr, 411cabdff1aSopenharmony_ci int32_t thresh_ptr) 412cabdff1aSopenharmony_ci{ 413cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 414cabdff1aSopenharmony_ci v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 415cabdff1aSopenharmony_ci v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; 416cabdff1aSopenharmony_ci v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 417cabdff1aSopenharmony_ci v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; 418cabdff1aSopenharmony_ci v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; 419cabdff1aSopenharmony_ci v16u8 zero = { 0 }; 420cabdff1aSopenharmony_ci 421cabdff1aSopenharmony_ci /* load vector elements */ 422cabdff1aSopenharmony_ci LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); 423cabdff1aSopenharmony_ci 424cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 425cabdff1aSopenharmony_ci tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8); 426cabdff1aSopenharmony_ci thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh); 
427cabdff1aSopenharmony_ci 428cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 429cabdff1aSopenharmony_ci tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8); 430cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit); 431cabdff1aSopenharmony_ci 432cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 433cabdff1aSopenharmony_ci tmp = (v16u8) __msa_fill_b(limit_ptr >> 8); 434cabdff1aSopenharmony_ci limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit); 435cabdff1aSopenharmony_ci 436cabdff1aSopenharmony_ci /* mask and hev */ 437cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 438cabdff1aSopenharmony_ci hev, mask, flat); 439cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 440cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 441cabdff1aSopenharmony_ci q1_out); 442cabdff1aSopenharmony_ci 443cabdff1aSopenharmony_ci flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); 444cabdff1aSopenharmony_ci 445cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 446cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 447cabdff1aSopenharmony_ci ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); 448cabdff1aSopenharmony_ci } else { 449cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, 450cabdff1aSopenharmony_ci zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, 451cabdff1aSopenharmony_ci q2_r, q3_r); 452cabdff1aSopenharmony_ci VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 453cabdff1aSopenharmony_ci p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); 454cabdff1aSopenharmony_ci 455cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 456cabdff1aSopenharmony_ci PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, 457cabdff1aSopenharmony_ci p0_filt8_r, p0_filt8_r, 
q0_filt8_r, q0_filt8_r, 458cabdff1aSopenharmony_ci p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r); 459cabdff1aSopenharmony_ci PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, 460cabdff1aSopenharmony_ci q1_filt8_r, q2_filt8_r); 461cabdff1aSopenharmony_ci 462cabdff1aSopenharmony_ci /* store pixel values */ 463cabdff1aSopenharmony_ci p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); 464cabdff1aSopenharmony_ci p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); 465cabdff1aSopenharmony_ci p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); 466cabdff1aSopenharmony_ci q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); 467cabdff1aSopenharmony_ci q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); 468cabdff1aSopenharmony_ci q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); 469cabdff1aSopenharmony_ci 470cabdff1aSopenharmony_ci src -= 3 * pitch; 471cabdff1aSopenharmony_ci 472cabdff1aSopenharmony_ci ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); 473cabdff1aSopenharmony_ci src += (4 * pitch); 474cabdff1aSopenharmony_ci ST_UB2(q1_out, q2_out, src, pitch); 475cabdff1aSopenharmony_ci src += (2 * pitch); 476cabdff1aSopenharmony_ci } 477cabdff1aSopenharmony_ci} 478cabdff1aSopenharmony_ci 479cabdff1aSopenharmony_civoid ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch, 480cabdff1aSopenharmony_ci int32_t b_limit_ptr, 481cabdff1aSopenharmony_ci int32_t limit_ptr, 482cabdff1aSopenharmony_ci int32_t thresh_ptr) 483cabdff1aSopenharmony_ci{ 484cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 485cabdff1aSopenharmony_ci v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 486cabdff1aSopenharmony_ci v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; 487cabdff1aSopenharmony_ci v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 488cabdff1aSopenharmony_ci v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; 489cabdff1aSopenharmony_ci v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; 490cabdff1aSopenharmony_ci v16u8 zero = { 0 }; 
491cabdff1aSopenharmony_ci 492cabdff1aSopenharmony_ci /* load vector elements */ 493cabdff1aSopenharmony_ci LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); 494cabdff1aSopenharmony_ci 495cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 496cabdff1aSopenharmony_ci tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8); 497cabdff1aSopenharmony_ci thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh); 498cabdff1aSopenharmony_ci 499cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 500cabdff1aSopenharmony_ci tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8); 501cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit); 502cabdff1aSopenharmony_ci 503cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 504cabdff1aSopenharmony_ci tmp = (v16u8) __msa_fill_b(limit_ptr >> 8); 505cabdff1aSopenharmony_ci limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit); 506cabdff1aSopenharmony_ci 507cabdff1aSopenharmony_ci /* mask and hev */ 508cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 509cabdff1aSopenharmony_ci hev, mask, flat); 510cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 511cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 512cabdff1aSopenharmony_ci q1_out); 513cabdff1aSopenharmony_ci 514cabdff1aSopenharmony_ci flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero); 515cabdff1aSopenharmony_ci 516cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 517cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 518cabdff1aSopenharmony_ci ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); 519cabdff1aSopenharmony_ci } else { 520cabdff1aSopenharmony_ci ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, 521cabdff1aSopenharmony_ci p0_l); 522cabdff1aSopenharmony_ci ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, 
q0_l, q1_l, q2_l, 523cabdff1aSopenharmony_ci q3_l); 524cabdff1aSopenharmony_ci VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 525cabdff1aSopenharmony_ci p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 526cabdff1aSopenharmony_ci 527cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 528cabdff1aSopenharmony_ci PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, 529cabdff1aSopenharmony_ci p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, 530cabdff1aSopenharmony_ci p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l); 531cabdff1aSopenharmony_ci PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l, 532cabdff1aSopenharmony_ci q1_filt8_l, q2_filt8_l); 533cabdff1aSopenharmony_ci 534cabdff1aSopenharmony_ci /* store pixel values */ 535cabdff1aSopenharmony_ci p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat); 536cabdff1aSopenharmony_ci p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat); 537cabdff1aSopenharmony_ci p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat); 538cabdff1aSopenharmony_ci q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat); 539cabdff1aSopenharmony_ci q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat); 540cabdff1aSopenharmony_ci q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat); 541cabdff1aSopenharmony_ci 542cabdff1aSopenharmony_ci src -= 3 * pitch; 543cabdff1aSopenharmony_ci 544cabdff1aSopenharmony_ci ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); 545cabdff1aSopenharmony_ci src += (4 * pitch); 546cabdff1aSopenharmony_ci ST_UB2(q1_out, q2_out, src, pitch); 547cabdff1aSopenharmony_ci src += (2 * pitch); 548cabdff1aSopenharmony_ci } 549cabdff1aSopenharmony_ci} 550cabdff1aSopenharmony_ci 551cabdff1aSopenharmony_cistatic int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch, 552cabdff1aSopenharmony_ci uint8_t *filter48, 553cabdff1aSopenharmony_ci int32_t b_limit_ptr, 554cabdff1aSopenharmony_ci int32_t limit_ptr, 555cabdff1aSopenharmony_ci int32_t thresh_ptr) 
556cabdff1aSopenharmony_ci{ 557cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 558cabdff1aSopenharmony_ci v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 559cabdff1aSopenharmony_ci v16u8 flat, mask, hev, thresh, b_limit, limit; 560cabdff1aSopenharmony_ci v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 561cabdff1aSopenharmony_ci v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 562cabdff1aSopenharmony_ci v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; 563cabdff1aSopenharmony_ci v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; 564cabdff1aSopenharmony_ci v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; 565cabdff1aSopenharmony_ci v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; 566cabdff1aSopenharmony_ci v16u8 zero = { 0 }; 567cabdff1aSopenharmony_ci 568cabdff1aSopenharmony_ci /* load vector elements */ 569cabdff1aSopenharmony_ci LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); 570cabdff1aSopenharmony_ci 571cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 572cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 573cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 574cabdff1aSopenharmony_ci 575cabdff1aSopenharmony_ci /* mask and hev */ 576cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 577cabdff1aSopenharmony_ci hev, mask, flat); 578cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 579cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 580cabdff1aSopenharmony_ci q1_out); 581cabdff1aSopenharmony_ci 582cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 583cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 584cabdff1aSopenharmony_ci ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); 585cabdff1aSopenharmony_ci 586cabdff1aSopenharmony_ci return 1; 587cabdff1aSopenharmony_ci } else { 588cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p3, zero, p2, 
zero, p1, zero, p0, zero, q0, zero, q1, 589cabdff1aSopenharmony_ci zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, 590cabdff1aSopenharmony_ci q2_r, q3_r); 591cabdff1aSopenharmony_ci VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 592cabdff1aSopenharmony_ci p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); 593cabdff1aSopenharmony_ci 594cabdff1aSopenharmony_ci ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, 595cabdff1aSopenharmony_ci p0_l); 596cabdff1aSopenharmony_ci ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, 597cabdff1aSopenharmony_ci q3_l); 598cabdff1aSopenharmony_ci VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 599cabdff1aSopenharmony_ci p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 600cabdff1aSopenharmony_ci 601cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 602cabdff1aSopenharmony_ci PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, 603cabdff1aSopenharmony_ci p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, 604cabdff1aSopenharmony_ci p0_filt8_r, q0_filt8_r); 605cabdff1aSopenharmony_ci PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, 606cabdff1aSopenharmony_ci q2_filt8_r); 607cabdff1aSopenharmony_ci 608cabdff1aSopenharmony_ci /* store pixel values */ 609cabdff1aSopenharmony_ci p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); 610cabdff1aSopenharmony_ci p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); 611cabdff1aSopenharmony_ci p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); 612cabdff1aSopenharmony_ci q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); 613cabdff1aSopenharmony_ci q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); 614cabdff1aSopenharmony_ci q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); 615cabdff1aSopenharmony_ci 616cabdff1aSopenharmony_ci ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); 
617cabdff1aSopenharmony_ci filter48 += (4 * 16); 618cabdff1aSopenharmony_ci ST_UB2(q1_out, q2_out, filter48, 16); 619cabdff1aSopenharmony_ci filter48 += (2 * 16); 620cabdff1aSopenharmony_ci ST_UB(flat, filter48); 621cabdff1aSopenharmony_ci 622cabdff1aSopenharmony_ci return 0; 623cabdff1aSopenharmony_ci } 624cabdff1aSopenharmony_ci} 625cabdff1aSopenharmony_ci 626cabdff1aSopenharmony_cistatic void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48) 627cabdff1aSopenharmony_ci{ 628cabdff1aSopenharmony_ci v16u8 flat, flat2, filter8; 629cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 630cabdff1aSopenharmony_ci v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 631cabdff1aSopenharmony_ci v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in; 632cabdff1aSopenharmony_ci v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in; 633cabdff1aSopenharmony_ci v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in; 634cabdff1aSopenharmony_ci v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in; 635cabdff1aSopenharmony_ci v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; 636cabdff1aSopenharmony_ci v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; 637cabdff1aSopenharmony_ci v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; 638cabdff1aSopenharmony_ci v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; 639cabdff1aSopenharmony_ci v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; 640cabdff1aSopenharmony_ci v8i16 l_out, r_out; 641cabdff1aSopenharmony_ci 642cabdff1aSopenharmony_ci flat = LD_UB(filter48 + 96); 643cabdff1aSopenharmony_ci 644cabdff1aSopenharmony_ci LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); 645cabdff1aSopenharmony_ci LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); 646cabdff1aSopenharmony_ci VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 647cabdff1aSopenharmony_ci 648cabdff1aSopenharmony_ci /* if flat2 is zero for all pixels, then no need to calculate other filter */ 649cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat2)) { 650cabdff1aSopenharmony_ci LD_UB4(filter48, 16, p2, p1, p0, q0); 
651cabdff1aSopenharmony_ci LD_UB2(filter48 + 4 * 16, 16, q1, q2); 652cabdff1aSopenharmony_ci 653cabdff1aSopenharmony_ci src -= 3 * pitch; 654cabdff1aSopenharmony_ci ST_UB4(p2, p1, p0, q0, src, pitch); 655cabdff1aSopenharmony_ci src += (4 * pitch); 656cabdff1aSopenharmony_ci ST_UB2(q1, q2, src, pitch); 657cabdff1aSopenharmony_ci } else { 658cabdff1aSopenharmony_ci src -= 7 * pitch; 659cabdff1aSopenharmony_ci 660cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, 661cabdff1aSopenharmony_ci zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, 662cabdff1aSopenharmony_ci p3_r_in, p2_r_in, p1_r_in, p0_r_in); 663cabdff1aSopenharmony_ci 664cabdff1aSopenharmony_ci q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0); 665cabdff1aSopenharmony_ci 666cabdff1aSopenharmony_ci tmp0_r = p7_r_in << 3; 667cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 668cabdff1aSopenharmony_ci tmp0_r += p6_r_in; 669cabdff1aSopenharmony_ci tmp0_r += q0_r_in; 670cabdff1aSopenharmony_ci tmp1_r = p6_r_in + p5_r_in; 671cabdff1aSopenharmony_ci tmp1_r += p4_r_in; 672cabdff1aSopenharmony_ci tmp1_r += p3_r_in; 673cabdff1aSopenharmony_ci tmp1_r += p2_r_in; 674cabdff1aSopenharmony_ci tmp1_r += p1_r_in; 675cabdff1aSopenharmony_ci tmp1_r += p0_r_in; 676cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 677cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 678cabdff1aSopenharmony_ci 679cabdff1aSopenharmony_ci ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, 680cabdff1aSopenharmony_ci p5_l_in, p4_l_in); 681cabdff1aSopenharmony_ci ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, 682cabdff1aSopenharmony_ci p1_l_in, p0_l_in); 683cabdff1aSopenharmony_ci q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0); 684cabdff1aSopenharmony_ci 685cabdff1aSopenharmony_ci tmp0_l = p7_l_in << 3; 686cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 687cabdff1aSopenharmony_ci tmp0_l += p6_l_in; 688cabdff1aSopenharmony_ci tmp0_l += q0_l_in; 
689cabdff1aSopenharmony_ci tmp1_l = p6_l_in + p5_l_in; 690cabdff1aSopenharmony_ci tmp1_l += p4_l_in; 691cabdff1aSopenharmony_ci tmp1_l += p3_l_in; 692cabdff1aSopenharmony_ci tmp1_l += p2_l_in; 693cabdff1aSopenharmony_ci tmp1_l += p1_l_in; 694cabdff1aSopenharmony_ci tmp1_l += p0_l_in; 695cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 696cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 697cabdff1aSopenharmony_ci 698cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 699cabdff1aSopenharmony_ci p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2); 700cabdff1aSopenharmony_ci ST_UB(p6, src); 701cabdff1aSopenharmony_ci src += pitch; 702cabdff1aSopenharmony_ci 703cabdff1aSopenharmony_ci /* p5 */ 704cabdff1aSopenharmony_ci q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1); 705cabdff1aSopenharmony_ci tmp0_r = p5_r_in - p6_r_in; 706cabdff1aSopenharmony_ci tmp0_r += q1_r_in; 707cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 708cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 709cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 710cabdff1aSopenharmony_ci 711cabdff1aSopenharmony_ci q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1); 712cabdff1aSopenharmony_ci tmp0_l = p5_l_in - p6_l_in; 713cabdff1aSopenharmony_ci tmp0_l += q1_l_in; 714cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 715cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 716cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 717cabdff1aSopenharmony_ci 718cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 719cabdff1aSopenharmony_ci p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2); 720cabdff1aSopenharmony_ci ST_UB(p5, src); 721cabdff1aSopenharmony_ci src += pitch; 722cabdff1aSopenharmony_ci 723cabdff1aSopenharmony_ci /* p4 */ 724cabdff1aSopenharmony_ci q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2); 725cabdff1aSopenharmony_ci tmp0_r = p4_r_in - p5_r_in; 726cabdff1aSopenharmony_ci tmp0_r += q2_r_in; 727cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 
728cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 729cabdff1aSopenharmony_ci r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4); 730cabdff1aSopenharmony_ci 731cabdff1aSopenharmony_ci q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2); 732cabdff1aSopenharmony_ci tmp0_l = p4_l_in - p5_l_in; 733cabdff1aSopenharmony_ci tmp0_l += q2_l_in; 734cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 735cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 736cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 737cabdff1aSopenharmony_ci 738cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 739cabdff1aSopenharmony_ci p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2); 740cabdff1aSopenharmony_ci ST_UB(p4, src); 741cabdff1aSopenharmony_ci src += pitch; 742cabdff1aSopenharmony_ci 743cabdff1aSopenharmony_ci /* p3 */ 744cabdff1aSopenharmony_ci q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3); 745cabdff1aSopenharmony_ci tmp0_r = p3_r_in - p4_r_in; 746cabdff1aSopenharmony_ci tmp0_r += q3_r_in; 747cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 748cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 749cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 750cabdff1aSopenharmony_ci 751cabdff1aSopenharmony_ci q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3); 752cabdff1aSopenharmony_ci tmp0_l = p3_l_in - p4_l_in; 753cabdff1aSopenharmony_ci tmp0_l += q3_l_in; 754cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 755cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 756cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 757cabdff1aSopenharmony_ci 758cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 759cabdff1aSopenharmony_ci p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2); 760cabdff1aSopenharmony_ci ST_UB(p3, src); 761cabdff1aSopenharmony_ci src += pitch; 762cabdff1aSopenharmony_ci 763cabdff1aSopenharmony_ci /* p2 */ 764cabdff1aSopenharmony_ci q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4); 765cabdff1aSopenharmony_ci filter8 = LD_UB(filter48); 
766cabdff1aSopenharmony_ci tmp0_r = p2_r_in - p3_r_in; 767cabdff1aSopenharmony_ci tmp0_r += q4_r_in; 768cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 769cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 770cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 771cabdff1aSopenharmony_ci 772cabdff1aSopenharmony_ci q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4); 773cabdff1aSopenharmony_ci tmp0_l = p2_l_in - p3_l_in; 774cabdff1aSopenharmony_ci tmp0_l += q4_l_in; 775cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 776cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 777cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 778cabdff1aSopenharmony_ci 779cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 780cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 781cabdff1aSopenharmony_ci ST_UB(filter8, src); 782cabdff1aSopenharmony_ci src += pitch; 783cabdff1aSopenharmony_ci 784cabdff1aSopenharmony_ci /* p1 */ 785cabdff1aSopenharmony_ci q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5); 786cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 16); 787cabdff1aSopenharmony_ci tmp0_r = p1_r_in - p2_r_in; 788cabdff1aSopenharmony_ci tmp0_r += q5_r_in; 789cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 790cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 791cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 792cabdff1aSopenharmony_ci 793cabdff1aSopenharmony_ci q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5); 794cabdff1aSopenharmony_ci tmp0_l = p1_l_in - p2_l_in; 795cabdff1aSopenharmony_ci tmp0_l += q5_l_in; 796cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 797cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 798cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 799cabdff1aSopenharmony_ci 800cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 801cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 802cabdff1aSopenharmony_ci ST_UB(filter8, src); 
803cabdff1aSopenharmony_ci src += pitch; 804cabdff1aSopenharmony_ci 805cabdff1aSopenharmony_ci /* p0 */ 806cabdff1aSopenharmony_ci q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6); 807cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 32); 808cabdff1aSopenharmony_ci tmp0_r = p0_r_in - p1_r_in; 809cabdff1aSopenharmony_ci tmp0_r += q6_r_in; 810cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 811cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 812cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 813cabdff1aSopenharmony_ci 814cabdff1aSopenharmony_ci q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6); 815cabdff1aSopenharmony_ci tmp0_l = p0_l_in - p1_l_in; 816cabdff1aSopenharmony_ci tmp0_l += q6_l_in; 817cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 818cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 819cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 820cabdff1aSopenharmony_ci 821cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 822cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 823cabdff1aSopenharmony_ci ST_UB(filter8, src); 824cabdff1aSopenharmony_ci src += pitch; 825cabdff1aSopenharmony_ci 826cabdff1aSopenharmony_ci /* q0 */ 827cabdff1aSopenharmony_ci q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7); 828cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 48); 829cabdff1aSopenharmony_ci tmp0_r = q7_r_in - p0_r_in; 830cabdff1aSopenharmony_ci tmp0_r += q0_r_in; 831cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 832cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 833cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 834cabdff1aSopenharmony_ci 835cabdff1aSopenharmony_ci q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7); 836cabdff1aSopenharmony_ci tmp0_l = q7_l_in - p0_l_in; 837cabdff1aSopenharmony_ci tmp0_l += q0_l_in; 838cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 839cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 840cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 
841cabdff1aSopenharmony_ci 842cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 843cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 844cabdff1aSopenharmony_ci ST_UB(filter8, src); 845cabdff1aSopenharmony_ci src += pitch; 846cabdff1aSopenharmony_ci 847cabdff1aSopenharmony_ci /* q1 */ 848cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 64); 849cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q0_r_in; 850cabdff1aSopenharmony_ci tmp0_r += q1_r_in; 851cabdff1aSopenharmony_ci tmp0_r -= p6_r_in; 852cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 853cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 854cabdff1aSopenharmony_ci 855cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q0_l_in; 856cabdff1aSopenharmony_ci tmp0_l += q1_l_in; 857cabdff1aSopenharmony_ci tmp0_l -= p6_l_in; 858cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 859cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 860cabdff1aSopenharmony_ci 861cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 862cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 863cabdff1aSopenharmony_ci ST_UB(filter8, src); 864cabdff1aSopenharmony_ci src += pitch; 865cabdff1aSopenharmony_ci 866cabdff1aSopenharmony_ci /* q2 */ 867cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 80); 868cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q1_r_in; 869cabdff1aSopenharmony_ci tmp0_r += q2_r_in; 870cabdff1aSopenharmony_ci tmp0_r -= p5_r_in; 871cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 872cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 873cabdff1aSopenharmony_ci 874cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q1_l_in; 875cabdff1aSopenharmony_ci tmp0_l += q2_l_in; 876cabdff1aSopenharmony_ci tmp0_l -= p5_l_in; 877cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 878cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 879cabdff1aSopenharmony_ci 880cabdff1aSopenharmony_ci r_out = (v8i16) 
__msa_pckev_b((v16i8) l_out, (v16i8) r_out); 881cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 882cabdff1aSopenharmony_ci ST_UB(filter8, src); 883cabdff1aSopenharmony_ci src += pitch; 884cabdff1aSopenharmony_ci 885cabdff1aSopenharmony_ci /* q3 */ 886cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q2_r_in; 887cabdff1aSopenharmony_ci tmp0_r += q3_r_in; 888cabdff1aSopenharmony_ci tmp0_r -= p4_r_in; 889cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 890cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 891cabdff1aSopenharmony_ci 892cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q2_l_in; 893cabdff1aSopenharmony_ci tmp0_l += q3_l_in; 894cabdff1aSopenharmony_ci tmp0_l -= p4_l_in; 895cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 896cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 897cabdff1aSopenharmony_ci 898cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 899cabdff1aSopenharmony_ci q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2); 900cabdff1aSopenharmony_ci ST_UB(q3, src); 901cabdff1aSopenharmony_ci src += pitch; 902cabdff1aSopenharmony_ci 903cabdff1aSopenharmony_ci /* q4 */ 904cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q3_r_in; 905cabdff1aSopenharmony_ci tmp0_r += q4_r_in; 906cabdff1aSopenharmony_ci tmp0_r -= p3_r_in; 907cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 908cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 909cabdff1aSopenharmony_ci 910cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q3_l_in; 911cabdff1aSopenharmony_ci tmp0_l += q4_l_in; 912cabdff1aSopenharmony_ci tmp0_l -= p3_l_in; 913cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 914cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 915cabdff1aSopenharmony_ci 916cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 917cabdff1aSopenharmony_ci q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2); 918cabdff1aSopenharmony_ci ST_UB(q4, src); 919cabdff1aSopenharmony_ci src += pitch; 
920cabdff1aSopenharmony_ci 921cabdff1aSopenharmony_ci /* q5 */ 922cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q4_r_in; 923cabdff1aSopenharmony_ci tmp0_r += q5_r_in; 924cabdff1aSopenharmony_ci tmp0_r -= p2_r_in; 925cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 926cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 927cabdff1aSopenharmony_ci 928cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q4_l_in; 929cabdff1aSopenharmony_ci tmp0_l += q5_l_in; 930cabdff1aSopenharmony_ci tmp0_l -= p2_l_in; 931cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 932cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 933cabdff1aSopenharmony_ci 934cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 935cabdff1aSopenharmony_ci q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2); 936cabdff1aSopenharmony_ci ST_UB(q5, src); 937cabdff1aSopenharmony_ci src += pitch; 938cabdff1aSopenharmony_ci 939cabdff1aSopenharmony_ci /* q6 */ 940cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q5_r_in; 941cabdff1aSopenharmony_ci tmp0_r += q6_r_in; 942cabdff1aSopenharmony_ci tmp0_r -= p1_r_in; 943cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 944cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 945cabdff1aSopenharmony_ci 946cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q5_l_in; 947cabdff1aSopenharmony_ci tmp0_l += q6_l_in; 948cabdff1aSopenharmony_ci tmp0_l -= p1_l_in; 949cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 950cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 951cabdff1aSopenharmony_ci 952cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 953cabdff1aSopenharmony_ci q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2); 954cabdff1aSopenharmony_ci ST_UB(q6, src); 955cabdff1aSopenharmony_ci } 956cabdff1aSopenharmony_ci} 957cabdff1aSopenharmony_ci 958cabdff1aSopenharmony_civoid ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch, 959cabdff1aSopenharmony_ci int32_t b_limit_ptr, 960cabdff1aSopenharmony_ci int32_t limit_ptr, 
961cabdff1aSopenharmony_ci int32_t thresh_ptr) 962cabdff1aSopenharmony_ci{ 963cabdff1aSopenharmony_ci uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT); 964cabdff1aSopenharmony_ci uint8_t early_exit = 0; 965cabdff1aSopenharmony_ci 966cabdff1aSopenharmony_ci early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], 967cabdff1aSopenharmony_ci b_limit_ptr, limit_ptr, thresh_ptr); 968cabdff1aSopenharmony_ci 969cabdff1aSopenharmony_ci if (0 == early_exit) { 970cabdff1aSopenharmony_ci vp9_hz_lpf_t16_16w(src, pitch, filter48); 971cabdff1aSopenharmony_ci } 972cabdff1aSopenharmony_ci} 973cabdff1aSopenharmony_ci 974cabdff1aSopenharmony_civoid ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch, 975cabdff1aSopenharmony_ci int32_t b_limit_ptr, 976cabdff1aSopenharmony_ci int32_t limit_ptr, 977cabdff1aSopenharmony_ci int32_t thresh_ptr) 978cabdff1aSopenharmony_ci{ 979cabdff1aSopenharmony_ci uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; 980cabdff1aSopenharmony_ci uint64_t dword0, dword1; 981cabdff1aSopenharmony_ci v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; 982cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; 983cabdff1aSopenharmony_ci v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 984cabdff1aSopenharmony_ci v16u8 p0_filter16, p1_filter16; 985cabdff1aSopenharmony_ci v8i16 p2_filter8, p1_filter8, p0_filter8; 986cabdff1aSopenharmony_ci v8i16 q0_filter8, q1_filter8, q2_filter8; 987cabdff1aSopenharmony_ci v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; 988cabdff1aSopenharmony_ci v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; 989cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 990cabdff1aSopenharmony_ci v8u16 tmp0, tmp1, tmp2; 991cabdff1aSopenharmony_ci 992cabdff1aSopenharmony_ci /* load vector elements */ 993cabdff1aSopenharmony_ci LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); 994cabdff1aSopenharmony_ci 995cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 
996cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 997cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 998cabdff1aSopenharmony_ci 999cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1000cabdff1aSopenharmony_ci hev, mask, flat); 1001cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1002cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1003cabdff1aSopenharmony_ci q1_out); 1004cabdff1aSopenharmony_ci 1005cabdff1aSopenharmony_ci flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); 1006cabdff1aSopenharmony_ci 1007cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 1008cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 1009cabdff1aSopenharmony_ci p1_d = __msa_copy_u_d((v2i64) p1_out, 0); 1010cabdff1aSopenharmony_ci p0_d = __msa_copy_u_d((v2i64) p0_out, 0); 1011cabdff1aSopenharmony_ci q0_d = __msa_copy_u_d((v2i64) q0_out, 0); 1012cabdff1aSopenharmony_ci q1_d = __msa_copy_u_d((v2i64) q1_out, 0); 1013cabdff1aSopenharmony_ci SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); 1014cabdff1aSopenharmony_ci } else { 1015cabdff1aSopenharmony_ci /* convert 8 bit input data into 16 bit */ 1016cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, 1017cabdff1aSopenharmony_ci q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, 1018cabdff1aSopenharmony_ci q1_r, q2_r, q3_r); 1019cabdff1aSopenharmony_ci VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, 1020cabdff1aSopenharmony_ci p2_filter8, p1_filter8, p0_filter8, q0_filter8, 1021cabdff1aSopenharmony_ci q1_filter8, q2_filter8); 1022cabdff1aSopenharmony_ci 1023cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 1024cabdff1aSopenharmony_ci PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, 1025cabdff1aSopenharmony_ci zero, q0_filter8, p2_filter8, p1_filter8, 
p0_filter8, 1026cabdff1aSopenharmony_ci q0_filter8); 1027cabdff1aSopenharmony_ci PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, 1028cabdff1aSopenharmony_ci q2_filter8); 1029cabdff1aSopenharmony_ci 1030cabdff1aSopenharmony_ci /* store pixel values */ 1031cabdff1aSopenharmony_ci p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat); 1032cabdff1aSopenharmony_ci p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat); 1033cabdff1aSopenharmony_ci p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat); 1034cabdff1aSopenharmony_ci q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat); 1035cabdff1aSopenharmony_ci q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat); 1036cabdff1aSopenharmony_ci q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat); 1037cabdff1aSopenharmony_ci 1038cabdff1aSopenharmony_ci /* load 16 vector elements */ 1039cabdff1aSopenharmony_ci LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); 1040cabdff1aSopenharmony_ci LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); 1041cabdff1aSopenharmony_ci 1042cabdff1aSopenharmony_ci VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 1043cabdff1aSopenharmony_ci 1044cabdff1aSopenharmony_ci /* if flat2 is zero for all pixels, then no need to calculate other filter */ 1045cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat2)) { 1046cabdff1aSopenharmony_ci p2_d = __msa_copy_u_d((v2i64) p2_out, 0); 1047cabdff1aSopenharmony_ci p1_d = __msa_copy_u_d((v2i64) p1_out, 0); 1048cabdff1aSopenharmony_ci p0_d = __msa_copy_u_d((v2i64) p0_out, 0); 1049cabdff1aSopenharmony_ci q0_d = __msa_copy_u_d((v2i64) q0_out, 0); 1050cabdff1aSopenharmony_ci q1_d = __msa_copy_u_d((v2i64) q1_out, 0); 1051cabdff1aSopenharmony_ci q2_d = __msa_copy_u_d((v2i64) q2_out, 0); 1052cabdff1aSopenharmony_ci 1053cabdff1aSopenharmony_ci SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); 1054cabdff1aSopenharmony_ci SD(q1_d, src + pitch); 1055cabdff1aSopenharmony_ci SD(q2_d, src + 2 * pitch); 1056cabdff1aSopenharmony_ci } else { 
1057cabdff1aSopenharmony_ci /* LSB(right) 8 pixel operation */ 1058cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, 1059cabdff1aSopenharmony_ci zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, 1060cabdff1aSopenharmony_ci q4_r, q5_r, q6_r, q7_r); 1061cabdff1aSopenharmony_ci 1062cabdff1aSopenharmony_ci tmp0 = p7_r << 3; 1063cabdff1aSopenharmony_ci tmp0 -= p7_r; 1064cabdff1aSopenharmony_ci tmp0 += p6_r; 1065cabdff1aSopenharmony_ci tmp0 += q0_r; 1066cabdff1aSopenharmony_ci 1067cabdff1aSopenharmony_ci src -= 7 * pitch; 1068cabdff1aSopenharmony_ci 1069cabdff1aSopenharmony_ci /* calculation of p6 and p5 */ 1070cabdff1aSopenharmony_ci tmp1 = p6_r + p5_r + p4_r + p3_r; 1071cabdff1aSopenharmony_ci tmp1 += (p2_r + p1_r + p0_r); 1072cabdff1aSopenharmony_ci tmp1 += tmp0; 1073cabdff1aSopenharmony_ci p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1074cabdff1aSopenharmony_ci tmp0 = p5_r - p6_r + q1_r - p7_r; 1075cabdff1aSopenharmony_ci tmp1 += tmp0; 1076cabdff1aSopenharmony_ci p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1077cabdff1aSopenharmony_ci PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, 1078cabdff1aSopenharmony_ci p0_filter16, p1_filter16); 1079cabdff1aSopenharmony_ci p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); 1080cabdff1aSopenharmony_ci p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); 1081cabdff1aSopenharmony_ci dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); 1082cabdff1aSopenharmony_ci dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); 1083cabdff1aSopenharmony_ci SD(dword0, src); 1084cabdff1aSopenharmony_ci src += pitch; 1085cabdff1aSopenharmony_ci SD(dword1, src); 1086cabdff1aSopenharmony_ci src += pitch; 1087cabdff1aSopenharmony_ci 1088cabdff1aSopenharmony_ci /* calculation of p4 and p3 */ 1089cabdff1aSopenharmony_ci tmp0 = p4_r - p5_r + q2_r - p7_r; 1090cabdff1aSopenharmony_ci tmp2 = p3_r - p4_r + q3_r - p7_r; 1091cabdff1aSopenharmony_ci tmp1 += tmp0; 1092cabdff1aSopenharmony_ci p0_filter16 = 
(v16u8) __msa_srari_h((v8i16) tmp1, 4); 1093cabdff1aSopenharmony_ci tmp1 += tmp2; 1094cabdff1aSopenharmony_ci p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1095cabdff1aSopenharmony_ci PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, 1096cabdff1aSopenharmony_ci p0_filter16, p1_filter16); 1097cabdff1aSopenharmony_ci p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); 1098cabdff1aSopenharmony_ci p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); 1099cabdff1aSopenharmony_ci dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); 1100cabdff1aSopenharmony_ci dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); 1101cabdff1aSopenharmony_ci SD(dword0, src); 1102cabdff1aSopenharmony_ci src += pitch; 1103cabdff1aSopenharmony_ci SD(dword1, src); 1104cabdff1aSopenharmony_ci src += pitch; 1105cabdff1aSopenharmony_ci 1106cabdff1aSopenharmony_ci /* calculation of p2 and p1 */ 1107cabdff1aSopenharmony_ci tmp0 = p2_r - p3_r + q4_r - p7_r; 1108cabdff1aSopenharmony_ci tmp2 = p1_r - p2_r + q5_r - p7_r; 1109cabdff1aSopenharmony_ci tmp1 += tmp0; 1110cabdff1aSopenharmony_ci p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1111cabdff1aSopenharmony_ci tmp1 += tmp2; 1112cabdff1aSopenharmony_ci p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1113cabdff1aSopenharmony_ci PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, 1114cabdff1aSopenharmony_ci p0_filter16, p1_filter16); 1115cabdff1aSopenharmony_ci p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); 1116cabdff1aSopenharmony_ci p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); 1117cabdff1aSopenharmony_ci dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); 1118cabdff1aSopenharmony_ci dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); 1119cabdff1aSopenharmony_ci SD(dword0, src); 1120cabdff1aSopenharmony_ci src += pitch; 1121cabdff1aSopenharmony_ci SD(dword1, src); 1122cabdff1aSopenharmony_ci src += pitch; 1123cabdff1aSopenharmony_ci 1124cabdff1aSopenharmony_ci /* calculation of p0 and q0 */ 1125cabdff1aSopenharmony_ci tmp0 = 
(p0_r - p1_r) + (q6_r - p7_r); 1126cabdff1aSopenharmony_ci tmp2 = (q7_r - p0_r) + (q0_r - p7_r); 1127cabdff1aSopenharmony_ci tmp1 += tmp0; 1128cabdff1aSopenharmony_ci p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1129cabdff1aSopenharmony_ci tmp1 += tmp2; 1130cabdff1aSopenharmony_ci p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1131cabdff1aSopenharmony_ci PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, 1132cabdff1aSopenharmony_ci p0_filter16, p1_filter16); 1133cabdff1aSopenharmony_ci p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); 1134cabdff1aSopenharmony_ci p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); 1135cabdff1aSopenharmony_ci dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); 1136cabdff1aSopenharmony_ci dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); 1137cabdff1aSopenharmony_ci SD(dword0, src); 1138cabdff1aSopenharmony_ci src += pitch; 1139cabdff1aSopenharmony_ci SD(dword1, src); 1140cabdff1aSopenharmony_ci src += pitch; 1141cabdff1aSopenharmony_ci 1142cabdff1aSopenharmony_ci /* calculation of q1 and q2 */ 1143cabdff1aSopenharmony_ci tmp0 = q7_r - q0_r + q1_r - p6_r; 1144cabdff1aSopenharmony_ci tmp2 = q7_r - q1_r + q2_r - p5_r; 1145cabdff1aSopenharmony_ci tmp1 += tmp0; 1146cabdff1aSopenharmony_ci p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1147cabdff1aSopenharmony_ci tmp1 += tmp2; 1148cabdff1aSopenharmony_ci p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1149cabdff1aSopenharmony_ci PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, 1150cabdff1aSopenharmony_ci p0_filter16, p1_filter16); 1151cabdff1aSopenharmony_ci p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); 1152cabdff1aSopenharmony_ci p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); 1153cabdff1aSopenharmony_ci dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); 1154cabdff1aSopenharmony_ci dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); 1155cabdff1aSopenharmony_ci SD(dword0, src); 1156cabdff1aSopenharmony_ci src += pitch; 
1157cabdff1aSopenharmony_ci SD(dword1, src); 1158cabdff1aSopenharmony_ci src += pitch; 1159cabdff1aSopenharmony_ci 1160cabdff1aSopenharmony_ci /* calculation of q3 and q4 */ 1161cabdff1aSopenharmony_ci tmp0 = (q7_r - q2_r) + (q3_r - p4_r); 1162cabdff1aSopenharmony_ci tmp2 = (q7_r - q3_r) + (q4_r - p3_r); 1163cabdff1aSopenharmony_ci tmp1 += tmp0; 1164cabdff1aSopenharmony_ci p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1165cabdff1aSopenharmony_ci tmp1 += tmp2; 1166cabdff1aSopenharmony_ci p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1167cabdff1aSopenharmony_ci PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, 1168cabdff1aSopenharmony_ci p0_filter16, p1_filter16); 1169cabdff1aSopenharmony_ci p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); 1170cabdff1aSopenharmony_ci p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); 1171cabdff1aSopenharmony_ci dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); 1172cabdff1aSopenharmony_ci dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); 1173cabdff1aSopenharmony_ci SD(dword0, src); 1174cabdff1aSopenharmony_ci src += pitch; 1175cabdff1aSopenharmony_ci SD(dword1, src); 1176cabdff1aSopenharmony_ci src += pitch; 1177cabdff1aSopenharmony_ci 1178cabdff1aSopenharmony_ci /* calculation of q5 and q6 */ 1179cabdff1aSopenharmony_ci tmp0 = (q7_r - q4_r) + (q5_r - p2_r); 1180cabdff1aSopenharmony_ci tmp2 = (q7_r - q5_r) + (q6_r - p1_r); 1181cabdff1aSopenharmony_ci tmp1 += tmp0; 1182cabdff1aSopenharmony_ci p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1183cabdff1aSopenharmony_ci tmp1 += tmp2; 1184cabdff1aSopenharmony_ci p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); 1185cabdff1aSopenharmony_ci PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, 1186cabdff1aSopenharmony_ci p0_filter16, p1_filter16); 1187cabdff1aSopenharmony_ci p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); 1188cabdff1aSopenharmony_ci p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2); 1189cabdff1aSopenharmony_ci dword0 = __msa_copy_u_d((v2i64) 
p0_filter16, 0); 1190cabdff1aSopenharmony_ci dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); 1191cabdff1aSopenharmony_ci SD(dword0, src); 1192cabdff1aSopenharmony_ci src += pitch; 1193cabdff1aSopenharmony_ci SD(dword1, src); 1194cabdff1aSopenharmony_ci } 1195cabdff1aSopenharmony_ci } 1196cabdff1aSopenharmony_ci} 1197cabdff1aSopenharmony_ci 1198cabdff1aSopenharmony_civoid ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch, 1199cabdff1aSopenharmony_ci int32_t b_limit_ptr, 1200cabdff1aSopenharmony_ci int32_t limit_ptr, 1201cabdff1aSopenharmony_ci int32_t thresh_ptr) 1202cabdff1aSopenharmony_ci{ 1203cabdff1aSopenharmony_ci v16u8 mask, hev, flat, limit, thresh, b_limit; 1204cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 1205cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3; 1206cabdff1aSopenharmony_ci 1207cabdff1aSopenharmony_ci LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); 1208cabdff1aSopenharmony_ci 1209cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 1210cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 1211cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 1212cabdff1aSopenharmony_ci 1213cabdff1aSopenharmony_ci TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, 1214cabdff1aSopenharmony_ci p3, p2, p1, p0, q0, q1, q2, q3); 1215cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1216cabdff1aSopenharmony_ci hev, mask, flat); 1217cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); 1218cabdff1aSopenharmony_ci ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); 1219cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec2, vec3); 1220cabdff1aSopenharmony_ci 1221cabdff1aSopenharmony_ci src -= 2; 1222cabdff1aSopenharmony_ci ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch); 1223cabdff1aSopenharmony_ci} 1224cabdff1aSopenharmony_ci 1225cabdff1aSopenharmony_civoid ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch, 
1226cabdff1aSopenharmony_ci int32_t b_limit_ptr, 1227cabdff1aSopenharmony_ci int32_t limit_ptr, 1228cabdff1aSopenharmony_ci int32_t thresh_ptr) 1229cabdff1aSopenharmony_ci{ 1230cabdff1aSopenharmony_ci v16u8 mask, hev, flat; 1231cabdff1aSopenharmony_ci v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; 1232cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 1233cabdff1aSopenharmony_ci v16u8 row0, row1, row2, row3, row4, row5, row6, row7; 1234cabdff1aSopenharmony_ci v16u8 row8, row9, row10, row11, row12, row13, row14, row15; 1235cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 1236cabdff1aSopenharmony_ci 1237cabdff1aSopenharmony_ci LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); 1238cabdff1aSopenharmony_ci LD_UB8(src - 4 + (8 * pitch), pitch, 1239cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15); 1240cabdff1aSopenharmony_ci 1241cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, 1242cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15, 1243cabdff1aSopenharmony_ci p3, p2, p1, p0, q0, q1, q2, q3); 1244cabdff1aSopenharmony_ci 1245cabdff1aSopenharmony_ci thresh0 = (v16u8) __msa_fill_b(thresh_ptr); 1246cabdff1aSopenharmony_ci thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8); 1247cabdff1aSopenharmony_ci thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0); 1248cabdff1aSopenharmony_ci 1249cabdff1aSopenharmony_ci b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr); 1250cabdff1aSopenharmony_ci b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8); 1251cabdff1aSopenharmony_ci b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0); 1252cabdff1aSopenharmony_ci 1253cabdff1aSopenharmony_ci limit0 = (v16u8) __msa_fill_b(limit_ptr); 1254cabdff1aSopenharmony_ci limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8); 1255cabdff1aSopenharmony_ci limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0); 
1256cabdff1aSopenharmony_ci 1257cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, 1258cabdff1aSopenharmony_ci hev, mask, flat); 1259cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); 1260cabdff1aSopenharmony_ci ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); 1261cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); 1262cabdff1aSopenharmony_ci ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); 1263cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); 1264cabdff1aSopenharmony_ci 1265cabdff1aSopenharmony_ci src -= 2; 1266cabdff1aSopenharmony_ci 1267cabdff1aSopenharmony_ci ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch); 1268cabdff1aSopenharmony_ci ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch); 1269cabdff1aSopenharmony_ci} 1270cabdff1aSopenharmony_ci 1271cabdff1aSopenharmony_civoid ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch, 1272cabdff1aSopenharmony_ci int32_t b_limit_ptr, 1273cabdff1aSopenharmony_ci int32_t limit_ptr, 1274cabdff1aSopenharmony_ci int32_t thresh_ptr) 1275cabdff1aSopenharmony_ci{ 1276cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 1277cabdff1aSopenharmony_ci v16u8 p1_out, p0_out, q0_out, q1_out; 1278cabdff1aSopenharmony_ci v16u8 flat, mask, hev, thresh, b_limit, limit; 1279cabdff1aSopenharmony_ci v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 1280cabdff1aSopenharmony_ci v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; 1281cabdff1aSopenharmony_ci v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; 1282cabdff1aSopenharmony_ci v16u8 zero = { 0 }; 1283cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, vec4; 1284cabdff1aSopenharmony_ci 1285cabdff1aSopenharmony_ci /* load vector elements */ 1286cabdff1aSopenharmony_ci LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); 1287cabdff1aSopenharmony_ci 1288cabdff1aSopenharmony_ci TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, 1289cabdff1aSopenharmony_ci p3, p2, p1, p0, q0, q1, q2, q3); 
1290cabdff1aSopenharmony_ci 1291cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 1292cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 1293cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 1294cabdff1aSopenharmony_ci 1295cabdff1aSopenharmony_ci /* mask and hev */ 1296cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1297cabdff1aSopenharmony_ci hev, mask, flat); 1298cabdff1aSopenharmony_ci /* flat4 */ 1299cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1300cabdff1aSopenharmony_ci /* filter4 */ 1301cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1302cabdff1aSopenharmony_ci q1_out); 1303cabdff1aSopenharmony_ci 1304cabdff1aSopenharmony_ci flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); 1305cabdff1aSopenharmony_ci 1306cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 1307cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 1308cabdff1aSopenharmony_ci /* Store 4 pixels p1-_q1 */ 1309cabdff1aSopenharmony_ci ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1310cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec2, vec3); 1311cabdff1aSopenharmony_ci 1312cabdff1aSopenharmony_ci src -= 2; 1313cabdff1aSopenharmony_ci ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch); 1314cabdff1aSopenharmony_ci } else { 1315cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, 1316cabdff1aSopenharmony_ci zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, 1317cabdff1aSopenharmony_ci q3_r); 1318cabdff1aSopenharmony_ci VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 1319cabdff1aSopenharmony_ci p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); 1320cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 1321cabdff1aSopenharmony_ci PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, 
p1_filt8_r, p0_filt8_r, 1322cabdff1aSopenharmony_ci p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r, 1323cabdff1aSopenharmony_ci p0_filt8_r, q0_filt8_r); 1324cabdff1aSopenharmony_ci PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r, 1325cabdff1aSopenharmony_ci q2_filt8_r); 1326cabdff1aSopenharmony_ci 1327cabdff1aSopenharmony_ci /* store pixel values */ 1328cabdff1aSopenharmony_ci p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); 1329cabdff1aSopenharmony_ci p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); 1330cabdff1aSopenharmony_ci p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); 1331cabdff1aSopenharmony_ci q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); 1332cabdff1aSopenharmony_ci q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); 1333cabdff1aSopenharmony_ci q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); 1334cabdff1aSopenharmony_ci 1335cabdff1aSopenharmony_ci /* Store 6 pixels p2-_q2 */ 1336cabdff1aSopenharmony_ci ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); 1337cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec2, vec3); 1338cabdff1aSopenharmony_ci vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1); 1339cabdff1aSopenharmony_ci 1340cabdff1aSopenharmony_ci src -= 3; 1341cabdff1aSopenharmony_ci ST_W4(vec2, 0, 1, 2, 3, src, pitch); 1342cabdff1aSopenharmony_ci ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch); 1343cabdff1aSopenharmony_ci src += (4 * pitch); 1344cabdff1aSopenharmony_ci ST_W4(vec3, 0, 1, 2, 3, src, pitch); 1345cabdff1aSopenharmony_ci ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch); 1346cabdff1aSopenharmony_ci } 1347cabdff1aSopenharmony_ci} 1348cabdff1aSopenharmony_ci 1349cabdff1aSopenharmony_civoid ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch, 1350cabdff1aSopenharmony_ci int32_t b_limit_ptr, 1351cabdff1aSopenharmony_ci int32_t limit_ptr, 1352cabdff1aSopenharmony_ci int32_t thresh_ptr) 1353cabdff1aSopenharmony_ci{ 1354cabdff1aSopenharmony_ci uint8_t *temp_src; 1355cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, 
q1, q0; 1356cabdff1aSopenharmony_ci v16u8 p1_out, p0_out, q0_out, q1_out; 1357cabdff1aSopenharmony_ci v16u8 flat, mask, hev, thresh, b_limit, limit; 1358cabdff1aSopenharmony_ci v16u8 row4, row5, row6, row7, row12, row13, row14, row15; 1359cabdff1aSopenharmony_ci v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 1360cabdff1aSopenharmony_ci v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 1361cabdff1aSopenharmony_ci v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; 1362cabdff1aSopenharmony_ci v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; 1363cabdff1aSopenharmony_ci v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; 1364cabdff1aSopenharmony_ci v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; 1365cabdff1aSopenharmony_ci v16u8 zero = { 0 }; 1366cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1367cabdff1aSopenharmony_ci 1368cabdff1aSopenharmony_ci temp_src = src - 4; 1369cabdff1aSopenharmony_ci 1370cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); 1371cabdff1aSopenharmony_ci temp_src += (8 * pitch); 1372cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); 1373cabdff1aSopenharmony_ci 1374cabdff1aSopenharmony_ci /* transpose 16x8 matrix into 8x16 */ 1375cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, 1376cabdff1aSopenharmony_ci q3, q2, q1, q0, row12, row13, row14, row15, 1377cabdff1aSopenharmony_ci p3, p2, p1, p0, q0, q1, q2, q3); 1378cabdff1aSopenharmony_ci 1379cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 1380cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8); 1381cabdff1aSopenharmony_ci thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh); 1382cabdff1aSopenharmony_ci 1383cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 1384cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8); 1385cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit); 
1386cabdff1aSopenharmony_ci 1387cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 1388cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8); 1389cabdff1aSopenharmony_ci limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit); 1390cabdff1aSopenharmony_ci 1391cabdff1aSopenharmony_ci /* mask and hev */ 1392cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1393cabdff1aSopenharmony_ci hev, mask, flat); 1394cabdff1aSopenharmony_ci /* flat4 */ 1395cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1396cabdff1aSopenharmony_ci /* filter4 */ 1397cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1398cabdff1aSopenharmony_ci q1_out); 1399cabdff1aSopenharmony_ci 1400cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 1401cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 1402cabdff1aSopenharmony_ci ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1403cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec2, vec3); 1404cabdff1aSopenharmony_ci ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1405cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec4, vec5); 1406cabdff1aSopenharmony_ci 1407cabdff1aSopenharmony_ci src -= 2; 1408cabdff1aSopenharmony_ci ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch); 1409cabdff1aSopenharmony_ci ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch); 1410cabdff1aSopenharmony_ci } else { 1411cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, 1412cabdff1aSopenharmony_ci zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, 1413cabdff1aSopenharmony_ci q3_r); 1414cabdff1aSopenharmony_ci VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 1415cabdff1aSopenharmony_ci p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); 1416cabdff1aSopenharmony_ci 1417cabdff1aSopenharmony_ci 
ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, 1418cabdff1aSopenharmony_ci p0_l); 1419cabdff1aSopenharmony_ci ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, 1420cabdff1aSopenharmony_ci q3_l); 1421cabdff1aSopenharmony_ci 1422cabdff1aSopenharmony_ci /* filter8 */ 1423cabdff1aSopenharmony_ci VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 1424cabdff1aSopenharmony_ci p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 1425cabdff1aSopenharmony_ci 1426cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 1427cabdff1aSopenharmony_ci PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, 1428cabdff1aSopenharmony_ci p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, 1429cabdff1aSopenharmony_ci p0_filt8_r, q0_filt8_r); 1430cabdff1aSopenharmony_ci PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, 1431cabdff1aSopenharmony_ci q2_filt8_r); 1432cabdff1aSopenharmony_ci 1433cabdff1aSopenharmony_ci /* store pixel values */ 1434cabdff1aSopenharmony_ci p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); 1435cabdff1aSopenharmony_ci p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); 1436cabdff1aSopenharmony_ci p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); 1437cabdff1aSopenharmony_ci q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); 1438cabdff1aSopenharmony_ci q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); 1439cabdff1aSopenharmony_ci q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); 1440cabdff1aSopenharmony_ci 1441cabdff1aSopenharmony_ci ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); 1442cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec3, vec4); 1443cabdff1aSopenharmony_ci ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); 1444cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec6, vec7); 1445cabdff1aSopenharmony_ci ILVRL_B2_SH(q2, q1, vec2, vec5); 1446cabdff1aSopenharmony_ci 1447cabdff1aSopenharmony_ci src -= 3; 1448cabdff1aSopenharmony_ci ST_W4(vec3, 0, 
1, 2, 3, src, pitch); 1449cabdff1aSopenharmony_ci ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch); 1450cabdff1aSopenharmony_ci src += (4 * pitch); 1451cabdff1aSopenharmony_ci ST_W4(vec4, 0, 1, 2, 3, src, pitch); 1452cabdff1aSopenharmony_ci ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch); 1453cabdff1aSopenharmony_ci src += (4 * pitch); 1454cabdff1aSopenharmony_ci ST_W4(vec6, 0, 1, 2, 3, src, pitch); 1455cabdff1aSopenharmony_ci ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch); 1456cabdff1aSopenharmony_ci src += (4 * pitch); 1457cabdff1aSopenharmony_ci ST_W4(vec7, 0, 1, 2, 3, src, pitch); 1458cabdff1aSopenharmony_ci ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch); 1459cabdff1aSopenharmony_ci } 1460cabdff1aSopenharmony_ci} 1461cabdff1aSopenharmony_ci 1462cabdff1aSopenharmony_civoid ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch, 1463cabdff1aSopenharmony_ci int32_t b_limit_ptr, 1464cabdff1aSopenharmony_ci int32_t limit_ptr, 1465cabdff1aSopenharmony_ci int32_t thresh_ptr) 1466cabdff1aSopenharmony_ci{ 1467cabdff1aSopenharmony_ci uint8_t *temp_src; 1468cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 1469cabdff1aSopenharmony_ci v16u8 p1_out, p0_out, q0_out, q1_out; 1470cabdff1aSopenharmony_ci v16u8 flat, mask, hev, thresh, b_limit, limit; 1471cabdff1aSopenharmony_ci v16u8 row4, row5, row6, row7, row12, row13, row14, row15; 1472cabdff1aSopenharmony_ci v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 1473cabdff1aSopenharmony_ci v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; 1474cabdff1aSopenharmony_ci v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; 1475cabdff1aSopenharmony_ci v16u8 zero = { 0 }; 1476cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1477cabdff1aSopenharmony_ci 1478cabdff1aSopenharmony_ci temp_src = src - 4; 1479cabdff1aSopenharmony_ci 1480cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); 1481cabdff1aSopenharmony_ci temp_src += (8 * pitch); 1482cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, q3, q2, q1, q0, 
row12, row13, row14, row15); 1483cabdff1aSopenharmony_ci 1484cabdff1aSopenharmony_ci /* transpose 16x8 matrix into 8x16 */ 1485cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, 1486cabdff1aSopenharmony_ci q3, q2, q1, q0, row12, row13, row14, row15, 1487cabdff1aSopenharmony_ci p3, p2, p1, p0, q0, q1, q2, q3); 1488cabdff1aSopenharmony_ci 1489cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 1490cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8); 1491cabdff1aSopenharmony_ci thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh); 1492cabdff1aSopenharmony_ci 1493cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 1494cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8); 1495cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit); 1496cabdff1aSopenharmony_ci 1497cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 1498cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8); 1499cabdff1aSopenharmony_ci limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit); 1500cabdff1aSopenharmony_ci 1501cabdff1aSopenharmony_ci /* mask and hev */ 1502cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1503cabdff1aSopenharmony_ci hev, mask, flat); 1504cabdff1aSopenharmony_ci /* flat4 */ 1505cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1506cabdff1aSopenharmony_ci /* filter4 */ 1507cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1508cabdff1aSopenharmony_ci q1_out); 1509cabdff1aSopenharmony_ci 1510cabdff1aSopenharmony_ci flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); 1511cabdff1aSopenharmony_ci 1512cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 1513cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 1514cabdff1aSopenharmony_ci ILVR_B2_SH(p0_out, p1_out, q1_out, 
q0_out, vec0, vec1); 1515cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec2, vec3); 1516cabdff1aSopenharmony_ci ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1517cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec4, vec5); 1518cabdff1aSopenharmony_ci 1519cabdff1aSopenharmony_ci src -= 2; 1520cabdff1aSopenharmony_ci ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch); 1521cabdff1aSopenharmony_ci ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch); 1522cabdff1aSopenharmony_ci } else { 1523cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, 1524cabdff1aSopenharmony_ci zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, 1525cabdff1aSopenharmony_ci q3_r); 1526cabdff1aSopenharmony_ci VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 1527cabdff1aSopenharmony_ci p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); 1528cabdff1aSopenharmony_ci 1529cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 1530cabdff1aSopenharmony_ci PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, 1531cabdff1aSopenharmony_ci p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r, 1532cabdff1aSopenharmony_ci p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r); 1533cabdff1aSopenharmony_ci PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, 1534cabdff1aSopenharmony_ci q1_filt8_r, q2_filt8_r); 1535cabdff1aSopenharmony_ci 1536cabdff1aSopenharmony_ci /* store pixel values */ 1537cabdff1aSopenharmony_ci p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); 1538cabdff1aSopenharmony_ci p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); 1539cabdff1aSopenharmony_ci p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); 1540cabdff1aSopenharmony_ci q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); 1541cabdff1aSopenharmony_ci q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); 1542cabdff1aSopenharmony_ci q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); 1543cabdff1aSopenharmony_ci 
1544cabdff1aSopenharmony_ci ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); 1545cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec3, vec4); 1546cabdff1aSopenharmony_ci ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); 1547cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec6, vec7); 1548cabdff1aSopenharmony_ci ILVRL_B2_SH(q2, q1, vec2, vec5); 1549cabdff1aSopenharmony_ci 1550cabdff1aSopenharmony_ci src -= 3; 1551cabdff1aSopenharmony_ci ST_W4(vec3, 0, 1, 2, 3, src, pitch); 1552cabdff1aSopenharmony_ci ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch); 1553cabdff1aSopenharmony_ci src += (4 * pitch); 1554cabdff1aSopenharmony_ci ST_W4(vec4, 0, 1, 2, 3, src, pitch); 1555cabdff1aSopenharmony_ci ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch); 1556cabdff1aSopenharmony_ci src += (4 * pitch); 1557cabdff1aSopenharmony_ci ST_W4(vec6, 0, 1, 2, 3, src, pitch); 1558cabdff1aSopenharmony_ci ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch); 1559cabdff1aSopenharmony_ci src += (4 * pitch); 1560cabdff1aSopenharmony_ci ST_W4(vec7, 0, 1, 2, 3, src, pitch); 1561cabdff1aSopenharmony_ci ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch); 1562cabdff1aSopenharmony_ci } 1563cabdff1aSopenharmony_ci} 1564cabdff1aSopenharmony_ci 1565cabdff1aSopenharmony_civoid ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch, 1566cabdff1aSopenharmony_ci int32_t b_limit_ptr, 1567cabdff1aSopenharmony_ci int32_t limit_ptr, 1568cabdff1aSopenharmony_ci int32_t thresh_ptr) 1569cabdff1aSopenharmony_ci{ 1570cabdff1aSopenharmony_ci uint8_t *temp_src; 1571cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 1572cabdff1aSopenharmony_ci v16u8 p1_out, p0_out, q0_out, q1_out; 1573cabdff1aSopenharmony_ci v16u8 flat, mask, hev, thresh, b_limit, limit; 1574cabdff1aSopenharmony_ci v16u8 row4, row5, row6, row7, row12, row13, row14, row15; 1575cabdff1aSopenharmony_ci v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 1576cabdff1aSopenharmony_ci v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; 1577cabdff1aSopenharmony_ci v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; 
1578cabdff1aSopenharmony_ci v16u8 zero = { 0 }; 1579cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1580cabdff1aSopenharmony_ci 1581cabdff1aSopenharmony_ci temp_src = src - 4; 1582cabdff1aSopenharmony_ci 1583cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); 1584cabdff1aSopenharmony_ci temp_src += (8 * pitch); 1585cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); 1586cabdff1aSopenharmony_ci 1587cabdff1aSopenharmony_ci /* transpose 16x8 matrix into 8x16 */ 1588cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, 1589cabdff1aSopenharmony_ci q3, q2, q1, q0, row12, row13, row14, row15, 1590cabdff1aSopenharmony_ci p3, p2, p1, p0, q0, q1, q2, q3); 1591cabdff1aSopenharmony_ci 1592cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 1593cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8); 1594cabdff1aSopenharmony_ci thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh); 1595cabdff1aSopenharmony_ci 1596cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 1597cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8); 1598cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit); 1599cabdff1aSopenharmony_ci 1600cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 1601cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8); 1602cabdff1aSopenharmony_ci limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit); 1603cabdff1aSopenharmony_ci 1604cabdff1aSopenharmony_ci /* mask and hev */ 1605cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1606cabdff1aSopenharmony_ci hev, mask, flat); 1607cabdff1aSopenharmony_ci /* flat4 */ 1608cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1609cabdff1aSopenharmony_ci /* filter4 */ 1610cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, 
p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1611cabdff1aSopenharmony_ci q1_out); 1612cabdff1aSopenharmony_ci 1613cabdff1aSopenharmony_ci flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero); 1614cabdff1aSopenharmony_ci 1615cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 1616cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 1617cabdff1aSopenharmony_ci ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1618cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec2, vec3); 1619cabdff1aSopenharmony_ci ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1620cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec4, vec5); 1621cabdff1aSopenharmony_ci 1622cabdff1aSopenharmony_ci src -= 2; 1623cabdff1aSopenharmony_ci ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch); 1624cabdff1aSopenharmony_ci ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch); 1625cabdff1aSopenharmony_ci } else { 1626cabdff1aSopenharmony_ci ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, 1627cabdff1aSopenharmony_ci p0_l); 1628cabdff1aSopenharmony_ci ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, 1629cabdff1aSopenharmony_ci q3_l); 1630cabdff1aSopenharmony_ci 1631cabdff1aSopenharmony_ci VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 1632cabdff1aSopenharmony_ci p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 1633cabdff1aSopenharmony_ci 1634cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 1635cabdff1aSopenharmony_ci PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, 1636cabdff1aSopenharmony_ci p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, 1637cabdff1aSopenharmony_ci p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l); 1638cabdff1aSopenharmony_ci PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l, 1639cabdff1aSopenharmony_ci q1_filt8_l, q2_filt8_l); 1640cabdff1aSopenharmony_ci 1641cabdff1aSopenharmony_ci /* store 
pixel values */ 1642cabdff1aSopenharmony_ci p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat); 1643cabdff1aSopenharmony_ci p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat); 1644cabdff1aSopenharmony_ci p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat); 1645cabdff1aSopenharmony_ci q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat); 1646cabdff1aSopenharmony_ci q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat); 1647cabdff1aSopenharmony_ci q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat); 1648cabdff1aSopenharmony_ci 1649cabdff1aSopenharmony_ci ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); 1650cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec3, vec4); 1651cabdff1aSopenharmony_ci ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); 1652cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec6, vec7); 1653cabdff1aSopenharmony_ci ILVRL_B2_SH(q2, q1, vec2, vec5); 1654cabdff1aSopenharmony_ci 1655cabdff1aSopenharmony_ci src -= 3; 1656cabdff1aSopenharmony_ci ST_W4(vec3, 0, 1, 2, 3, src, pitch); 1657cabdff1aSopenharmony_ci ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch); 1658cabdff1aSopenharmony_ci src += (4 * pitch); 1659cabdff1aSopenharmony_ci ST_W4(vec4, 0, 1, 2, 3, src, pitch); 1660cabdff1aSopenharmony_ci ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch); 1661cabdff1aSopenharmony_ci src += (4 * pitch); 1662cabdff1aSopenharmony_ci ST_W4(vec6, 0, 1, 2, 3, src, pitch); 1663cabdff1aSopenharmony_ci ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch); 1664cabdff1aSopenharmony_ci src += (4 * pitch); 1665cabdff1aSopenharmony_ci ST_W4(vec7, 0, 1, 2, 3, src, pitch); 1666cabdff1aSopenharmony_ci ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch); 1667cabdff1aSopenharmony_ci } 1668cabdff1aSopenharmony_ci} 1669cabdff1aSopenharmony_ci 1670cabdff1aSopenharmony_cistatic void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, 1671cabdff1aSopenharmony_ci uint8_t *output, int32_t out_pitch) 1672cabdff1aSopenharmony_ci{ 1673cabdff1aSopenharmony_ci v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; 
1674cabdff1aSopenharmony_ci v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1675cabdff1aSopenharmony_ci v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 1676cabdff1aSopenharmony_ci v16i8 zeros = { 0 }; 1677cabdff1aSopenharmony_ci 1678cabdff1aSopenharmony_ci LD_UB8(input, in_pitch, 1679cabdff1aSopenharmony_ci p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org); 1680cabdff1aSopenharmony_ci /* 8x8 transpose */ 1681cabdff1aSopenharmony_ci TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, 1682cabdff1aSopenharmony_ci p0_org, p7, p6, p5, p4, p3, p2, p1, p0); 1683cabdff1aSopenharmony_ci /* 8x8 transpose */ 1684cabdff1aSopenharmony_ci ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org, 1685cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1686cabdff1aSopenharmony_ci ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); 1687cabdff1aSopenharmony_ci ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); 1688cabdff1aSopenharmony_ci ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); 1689cabdff1aSopenharmony_ci ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); 1690cabdff1aSopenharmony_ci SLDI_B4_UB(zeros, q0, zeros, q2, zeros, q4, zeros, q6, 8, q1, q3, q5, q7); 1691cabdff1aSopenharmony_ci 1692cabdff1aSopenharmony_ci ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); 1693cabdff1aSopenharmony_ci output += (8 * out_pitch); 1694cabdff1aSopenharmony_ci ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); 1695cabdff1aSopenharmony_ci} 1696cabdff1aSopenharmony_ci 1697cabdff1aSopenharmony_cistatic void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, 1698cabdff1aSopenharmony_ci uint8_t *output, int32_t out_pitch) 1699cabdff1aSopenharmony_ci{ 1700cabdff1aSopenharmony_ci v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o; 1701cabdff1aSopenharmony_ci v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 1702cabdff1aSopenharmony_ci 1703cabdff1aSopenharmony_ci LD_UB8(input, in_pitch, p7, p6, 
p5, p4, p3, p2, p1, p0); 1704cabdff1aSopenharmony_ci LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); 1705cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, 1706cabdff1aSopenharmony_ci q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); 1707cabdff1aSopenharmony_ci ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); 1708cabdff1aSopenharmony_ci} 1709cabdff1aSopenharmony_ci 1710cabdff1aSopenharmony_cistatic void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch, 1711cabdff1aSopenharmony_ci uint8_t *output, int32_t out_pitch) 1712cabdff1aSopenharmony_ci{ 1713cabdff1aSopenharmony_ci v16u8 row0, row1, row2, row3, row4, row5, row6, row7; 1714cabdff1aSopenharmony_ci v16u8 row8, row9, row10, row11, row12, row13, row14, row15; 1715cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; 1716cabdff1aSopenharmony_ci v4i32 tmp2, tmp3; 1717cabdff1aSopenharmony_ci v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 1718cabdff1aSopenharmony_ci 1719cabdff1aSopenharmony_ci LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); 1720cabdff1aSopenharmony_ci input += (8 * in_pitch); 1721cabdff1aSopenharmony_ci LD_UB8(input, in_pitch, 1722cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15); 1723cabdff1aSopenharmony_ci 1724cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, 1725cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15, 1726cabdff1aSopenharmony_ci p7, p6, p5, p4, p3, p2, p1, p0); 1727cabdff1aSopenharmony_ci 1728cabdff1aSopenharmony_ci /* transpose 16x8 matrix into 8x16 */ 1729cabdff1aSopenharmony_ci /* total 8 intermediate register and 32 instructions */ 1730cabdff1aSopenharmony_ci q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0); 1731cabdff1aSopenharmony_ci q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1); 
1732cabdff1aSopenharmony_ci q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2); 1733cabdff1aSopenharmony_ci q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3); 1734cabdff1aSopenharmony_ci q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4); 1735cabdff1aSopenharmony_ci q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5); 1736cabdff1aSopenharmony_ci q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6); 1737cabdff1aSopenharmony_ci q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7); 1738cabdff1aSopenharmony_ci 1739cabdff1aSopenharmony_ci ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1); 1740cabdff1aSopenharmony_ci tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7); 1741cabdff1aSopenharmony_ci tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5); 1742cabdff1aSopenharmony_ci 1743cabdff1aSopenharmony_ci ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); 1744cabdff1aSopenharmony_ci tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3); 1745cabdff1aSopenharmony_ci tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1); 1746cabdff1aSopenharmony_ci 1747cabdff1aSopenharmony_ci ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3); 1748cabdff1aSopenharmony_ci q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2); 1749cabdff1aSopenharmony_ci q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2); 1750cabdff1aSopenharmony_ci 1751cabdff1aSopenharmony_ci tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0); 1752cabdff1aSopenharmony_ci tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5); 1753cabdff1aSopenharmony_ci q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2); 1754cabdff1aSopenharmony_ci q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2); 1755cabdff1aSopenharmony_ci 1756cabdff1aSopenharmony_ci ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); 1757cabdff1aSopenharmony_ci q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2); 1758cabdff1aSopenharmony_ci q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2); 1759cabdff1aSopenharmony_ci 1760cabdff1aSopenharmony_ci tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4); 1761cabdff1aSopenharmony_ci tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6); 
1762cabdff1aSopenharmony_ci q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2); 1763cabdff1aSopenharmony_ci q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2); 1764cabdff1aSopenharmony_ci 1765cabdff1aSopenharmony_ci ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); 1766cabdff1aSopenharmony_ci output += (8 * out_pitch); 1767cabdff1aSopenharmony_ci ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); 1768cabdff1aSopenharmony_ci} 1769cabdff1aSopenharmony_ci 1770cabdff1aSopenharmony_cistatic int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, 1771cabdff1aSopenharmony_ci uint8_t *src_org, int32_t pitch_org, 1772cabdff1aSopenharmony_ci int32_t b_limit_ptr, 1773cabdff1aSopenharmony_ci int32_t limit_ptr, 1774cabdff1aSopenharmony_ci int32_t thresh_ptr) 1775cabdff1aSopenharmony_ci{ 1776cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 1777cabdff1aSopenharmony_ci v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 1778cabdff1aSopenharmony_ci v16u8 flat, mask, hev, thresh, b_limit, limit; 1779cabdff1aSopenharmony_ci v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 1780cabdff1aSopenharmony_ci v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; 1781cabdff1aSopenharmony_ci v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; 1782cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 1783cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3; 1784cabdff1aSopenharmony_ci 1785cabdff1aSopenharmony_ci /* load vector elements */ 1786cabdff1aSopenharmony_ci LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); 1787cabdff1aSopenharmony_ci 1788cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 1789cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 1790cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 1791cabdff1aSopenharmony_ci 1792cabdff1aSopenharmony_ci /* mask and hev */ 1793cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1794cabdff1aSopenharmony_ci hev, mask, flat); 1795cabdff1aSopenharmony_ci /* 
flat4 */ 1796cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1797cabdff1aSopenharmony_ci /* filter4 */ 1798cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1799cabdff1aSopenharmony_ci q1_out); 1800cabdff1aSopenharmony_ci 1801cabdff1aSopenharmony_ci flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); 1802cabdff1aSopenharmony_ci 1803cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 1804cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 1805cabdff1aSopenharmony_ci ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1806cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec2, vec3); 1807cabdff1aSopenharmony_ci ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org); 1808cabdff1aSopenharmony_ci return 1; 1809cabdff1aSopenharmony_ci } else { 1810cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, 1811cabdff1aSopenharmony_ci zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, 1812cabdff1aSopenharmony_ci q3_r); 1813cabdff1aSopenharmony_ci VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 1814cabdff1aSopenharmony_ci p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); 1815cabdff1aSopenharmony_ci 1816cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 1817cabdff1aSopenharmony_ci p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r); 1818cabdff1aSopenharmony_ci p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r); 1819cabdff1aSopenharmony_ci p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r); 1820cabdff1aSopenharmony_ci q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r); 1821cabdff1aSopenharmony_ci q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r); 1822cabdff1aSopenharmony_ci q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r); 
1823cabdff1aSopenharmony_ci 1824cabdff1aSopenharmony_ci /* store pixel values */ 1825cabdff1aSopenharmony_ci p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat); 1826cabdff1aSopenharmony_ci p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat); 1827cabdff1aSopenharmony_ci p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat); 1828cabdff1aSopenharmony_ci q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat); 1829cabdff1aSopenharmony_ci q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat); 1830cabdff1aSopenharmony_ci q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat); 1831cabdff1aSopenharmony_ci 1832cabdff1aSopenharmony_ci ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); 1833cabdff1aSopenharmony_ci filter48 += (4 * 16); 1834cabdff1aSopenharmony_ci ST_UB2(q1_out, q2_out, filter48, 16); 1835cabdff1aSopenharmony_ci filter48 += (2 * 16); 1836cabdff1aSopenharmony_ci ST_UB(flat, filter48); 1837cabdff1aSopenharmony_ci 1838cabdff1aSopenharmony_ci return 0; 1839cabdff1aSopenharmony_ci } 1840cabdff1aSopenharmony_ci} 1841cabdff1aSopenharmony_ci 1842cabdff1aSopenharmony_cistatic int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, 1843cabdff1aSopenharmony_ci uint8_t *filter48) 1844cabdff1aSopenharmony_ci{ 1845cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 1846cabdff1aSopenharmony_ci v16u8 filter8, flat, flat2; 1847cabdff1aSopenharmony_ci v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 1848cabdff1aSopenharmony_ci v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in; 1849cabdff1aSopenharmony_ci v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in; 1850cabdff1aSopenharmony_ci v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in; 1851cabdff1aSopenharmony_ci v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in; 1852cabdff1aSopenharmony_ci v8u16 tmp0_r, tmp1_r; 1853cabdff1aSopenharmony_ci v8i16 r_out; 1854cabdff1aSopenharmony_ci 1855cabdff1aSopenharmony_ci flat = LD_UB(filter48 + 6 * 16); 1856cabdff1aSopenharmony_ci 1857cabdff1aSopenharmony_ci LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); 
1858cabdff1aSopenharmony_ci LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); 1859cabdff1aSopenharmony_ci 1860cabdff1aSopenharmony_ci VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 1861cabdff1aSopenharmony_ci 1862cabdff1aSopenharmony_ci /* if flat2 is zero for all pixels, then no need to calculate other filter */ 1863cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat2)) { 1864cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, vec4; 1865cabdff1aSopenharmony_ci 1866cabdff1aSopenharmony_ci LD_UB4(filter48, 16, p2, p1, p0, q0); 1867cabdff1aSopenharmony_ci LD_UB2(filter48 + 4 * 16, 16, q1, q2); 1868cabdff1aSopenharmony_ci 1869cabdff1aSopenharmony_ci ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); 1870cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec3, vec4); 1871cabdff1aSopenharmony_ci vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1); 1872cabdff1aSopenharmony_ci 1873cabdff1aSopenharmony_ci src_org -= 3; 1874cabdff1aSopenharmony_ci ST_W4(vec3, 0, 1, 2, 3, src_org, pitch); 1875cabdff1aSopenharmony_ci ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch); 1876cabdff1aSopenharmony_ci src_org += (4 * pitch); 1877cabdff1aSopenharmony_ci ST_W4(vec4, 0, 1, 2, 3, src_org, pitch); 1878cabdff1aSopenharmony_ci ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch); 1879cabdff1aSopenharmony_ci 1880cabdff1aSopenharmony_ci return 1; 1881cabdff1aSopenharmony_ci } else { 1882cabdff1aSopenharmony_ci src -= 7 * 16; 1883cabdff1aSopenharmony_ci 1884cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, 1885cabdff1aSopenharmony_ci zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, 1886cabdff1aSopenharmony_ci p3_r_in, p2_r_in, p1_r_in, p0_r_in); 1887cabdff1aSopenharmony_ci q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0); 1888cabdff1aSopenharmony_ci 1889cabdff1aSopenharmony_ci tmp0_r = p7_r_in << 3; 1890cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 1891cabdff1aSopenharmony_ci tmp0_r += p6_r_in; 1892cabdff1aSopenharmony_ci tmp0_r += q0_r_in; 
1893cabdff1aSopenharmony_ci tmp1_r = p6_r_in + p5_r_in; 1894cabdff1aSopenharmony_ci tmp1_r += p4_r_in; 1895cabdff1aSopenharmony_ci tmp1_r += p3_r_in; 1896cabdff1aSopenharmony_ci tmp1_r += p2_r_in; 1897cabdff1aSopenharmony_ci tmp1_r += p1_r_in; 1898cabdff1aSopenharmony_ci tmp1_r += p0_r_in; 1899cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 1900cabdff1aSopenharmony_ci 1901cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1902cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1903cabdff1aSopenharmony_ci p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2); 1904cabdff1aSopenharmony_ci ST_D1(p6, 0, src); 1905cabdff1aSopenharmony_ci src += 16; 1906cabdff1aSopenharmony_ci 1907cabdff1aSopenharmony_ci /* p5 */ 1908cabdff1aSopenharmony_ci q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1); 1909cabdff1aSopenharmony_ci tmp0_r = p5_r_in - p6_r_in; 1910cabdff1aSopenharmony_ci tmp0_r += q1_r_in; 1911cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 1912cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 1913cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1914cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1915cabdff1aSopenharmony_ci p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2); 1916cabdff1aSopenharmony_ci ST_D1(p5, 0, src); 1917cabdff1aSopenharmony_ci src += 16; 1918cabdff1aSopenharmony_ci 1919cabdff1aSopenharmony_ci /* p4 */ 1920cabdff1aSopenharmony_ci q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2); 1921cabdff1aSopenharmony_ci tmp0_r = p4_r_in - p5_r_in; 1922cabdff1aSopenharmony_ci tmp0_r += q2_r_in; 1923cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 1924cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 1925cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1926cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1927cabdff1aSopenharmony_ci p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2); 1928cabdff1aSopenharmony_ci ST_D1(p4, 0, src); 1929cabdff1aSopenharmony_ci src += 16; 
1930cabdff1aSopenharmony_ci 1931cabdff1aSopenharmony_ci /* p3 */ 1932cabdff1aSopenharmony_ci q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3); 1933cabdff1aSopenharmony_ci tmp0_r = p3_r_in - p4_r_in; 1934cabdff1aSopenharmony_ci tmp0_r += q3_r_in; 1935cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 1936cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 1937cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1938cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1939cabdff1aSopenharmony_ci p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2); 1940cabdff1aSopenharmony_ci ST_D1(p3, 0, src); 1941cabdff1aSopenharmony_ci src += 16; 1942cabdff1aSopenharmony_ci 1943cabdff1aSopenharmony_ci /* p2 */ 1944cabdff1aSopenharmony_ci q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4); 1945cabdff1aSopenharmony_ci filter8 = LD_UB(filter48); 1946cabdff1aSopenharmony_ci tmp0_r = p2_r_in - p3_r_in; 1947cabdff1aSopenharmony_ci tmp0_r += q4_r_in; 1948cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 1949cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 1950cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1951cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1952cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 1953cabdff1aSopenharmony_ci ST_D1(filter8, 0, src); 1954cabdff1aSopenharmony_ci src += 16; 1955cabdff1aSopenharmony_ci 1956cabdff1aSopenharmony_ci /* p1 */ 1957cabdff1aSopenharmony_ci q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5); 1958cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 16); 1959cabdff1aSopenharmony_ci tmp0_r = p1_r_in - p2_r_in; 1960cabdff1aSopenharmony_ci tmp0_r += q5_r_in; 1961cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 1962cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 1963cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1964cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1965cabdff1aSopenharmony_ci filter8 = 
__msa_bmnz_v(filter8, (v16u8) r_out, flat2); 1966cabdff1aSopenharmony_ci ST_D1(filter8, 0, src); 1967cabdff1aSopenharmony_ci src += 16; 1968cabdff1aSopenharmony_ci 1969cabdff1aSopenharmony_ci /* p0 */ 1970cabdff1aSopenharmony_ci q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6); 1971cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 32); 1972cabdff1aSopenharmony_ci tmp0_r = p0_r_in - p1_r_in; 1973cabdff1aSopenharmony_ci tmp0_r += q6_r_in; 1974cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 1975cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 1976cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1977cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1978cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 1979cabdff1aSopenharmony_ci ST_D1(filter8, 0, src); 1980cabdff1aSopenharmony_ci src += 16; 1981cabdff1aSopenharmony_ci 1982cabdff1aSopenharmony_ci /* q0 */ 1983cabdff1aSopenharmony_ci q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7); 1984cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 48); 1985cabdff1aSopenharmony_ci tmp0_r = q7_r_in - p0_r_in; 1986cabdff1aSopenharmony_ci tmp0_r += q0_r_in; 1987cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 1988cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 1989cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1990cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1991cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 1992cabdff1aSopenharmony_ci ST_D1(filter8, 0, src); 1993cabdff1aSopenharmony_ci src += 16; 1994cabdff1aSopenharmony_ci 1995cabdff1aSopenharmony_ci /* q1 */ 1996cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 64); 1997cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q0_r_in; 1998cabdff1aSopenharmony_ci tmp0_r += q1_r_in; 1999cabdff1aSopenharmony_ci tmp0_r -= p6_r_in; 2000cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2001cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 
2002cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2003cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2004cabdff1aSopenharmony_ci ST_D1(filter8, 0, src); 2005cabdff1aSopenharmony_ci src += 16; 2006cabdff1aSopenharmony_ci 2007cabdff1aSopenharmony_ci /* q2 */ 2008cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 80); 2009cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q1_r_in; 2010cabdff1aSopenharmony_ci tmp0_r += q2_r_in; 2011cabdff1aSopenharmony_ci tmp0_r -= p5_r_in; 2012cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2013cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2014cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2015cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2016cabdff1aSopenharmony_ci ST_D1(filter8, 0, src); 2017cabdff1aSopenharmony_ci src += 16; 2018cabdff1aSopenharmony_ci 2019cabdff1aSopenharmony_ci /* q3 */ 2020cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q2_r_in; 2021cabdff1aSopenharmony_ci tmp0_r += q3_r_in; 2022cabdff1aSopenharmony_ci tmp0_r -= p4_r_in; 2023cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2024cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2025cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2026cabdff1aSopenharmony_ci q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2); 2027cabdff1aSopenharmony_ci ST_D1(q3, 0, src); 2028cabdff1aSopenharmony_ci src += 16; 2029cabdff1aSopenharmony_ci 2030cabdff1aSopenharmony_ci /* q4 */ 2031cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q3_r_in; 2032cabdff1aSopenharmony_ci tmp0_r += q4_r_in; 2033cabdff1aSopenharmony_ci tmp0_r -= p3_r_in; 2034cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2035cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2036cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2037cabdff1aSopenharmony_ci q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2); 
2038cabdff1aSopenharmony_ci ST_D1(q4, 0, src); 2039cabdff1aSopenharmony_ci src += 16; 2040cabdff1aSopenharmony_ci 2041cabdff1aSopenharmony_ci /* q5 */ 2042cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q4_r_in; 2043cabdff1aSopenharmony_ci tmp0_r += q5_r_in; 2044cabdff1aSopenharmony_ci tmp0_r -= p2_r_in; 2045cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2046cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2047cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2048cabdff1aSopenharmony_ci q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2); 2049cabdff1aSopenharmony_ci ST_D1(q5, 0, src); 2050cabdff1aSopenharmony_ci src += 16; 2051cabdff1aSopenharmony_ci 2052cabdff1aSopenharmony_ci /* q6 */ 2053cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q5_r_in; 2054cabdff1aSopenharmony_ci tmp0_r += q6_r_in; 2055cabdff1aSopenharmony_ci tmp0_r -= p1_r_in; 2056cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2057cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2058cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2059cabdff1aSopenharmony_ci q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2); 2060cabdff1aSopenharmony_ci ST_D1(q6, 0, src); 2061cabdff1aSopenharmony_ci 2062cabdff1aSopenharmony_ci return 0; 2063cabdff1aSopenharmony_ci } 2064cabdff1aSopenharmony_ci} 2065cabdff1aSopenharmony_ci 2066cabdff1aSopenharmony_civoid ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch, 2067cabdff1aSopenharmony_ci int32_t b_limit_ptr, 2068cabdff1aSopenharmony_ci int32_t limit_ptr, 2069cabdff1aSopenharmony_ci int32_t thresh_ptr) 2070cabdff1aSopenharmony_ci{ 2071cabdff1aSopenharmony_ci uint8_t early_exit = 0; 2072cabdff1aSopenharmony_ci uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT); 2073cabdff1aSopenharmony_ci uint8_t *filter48 = &transposed_input[16 * 16]; 2074cabdff1aSopenharmony_ci 2075cabdff1aSopenharmony_ci vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); 2076cabdff1aSopenharmony_ci 
2077cabdff1aSopenharmony_ci early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), 2078cabdff1aSopenharmony_ci &filter48[0], src, pitch, 2079cabdff1aSopenharmony_ci b_limit_ptr, limit_ptr, thresh_ptr); 2080cabdff1aSopenharmony_ci 2081cabdff1aSopenharmony_ci if (0 == early_exit) { 2082cabdff1aSopenharmony_ci early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, 2083cabdff1aSopenharmony_ci &filter48[0]); 2084cabdff1aSopenharmony_ci 2085cabdff1aSopenharmony_ci if (0 == early_exit) { 2086cabdff1aSopenharmony_ci vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); 2087cabdff1aSopenharmony_ci } 2088cabdff1aSopenharmony_ci } 2089cabdff1aSopenharmony_ci} 2090cabdff1aSopenharmony_ci 2091cabdff1aSopenharmony_cistatic int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, 2092cabdff1aSopenharmony_ci uint8_t *src_org, ptrdiff_t pitch, 2093cabdff1aSopenharmony_ci int32_t b_limit_ptr, 2094cabdff1aSopenharmony_ci int32_t limit_ptr, 2095cabdff1aSopenharmony_ci int32_t thresh_ptr) 2096cabdff1aSopenharmony_ci{ 2097cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 2098cabdff1aSopenharmony_ci v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 2099cabdff1aSopenharmony_ci v16u8 flat, mask, hev, thresh, b_limit, limit; 2100cabdff1aSopenharmony_ci v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 2101cabdff1aSopenharmony_ci v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 2102cabdff1aSopenharmony_ci v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; 2103cabdff1aSopenharmony_ci v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; 2104cabdff1aSopenharmony_ci v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; 2105cabdff1aSopenharmony_ci v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; 2106cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 2107cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, vec4, vec5; 2108cabdff1aSopenharmony_ci 2109cabdff1aSopenharmony_ci /* load vector elements */ 2110cabdff1aSopenharmony_ci LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, 
q0, q1, q2, q3); 2111cabdff1aSopenharmony_ci 2112cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_ptr); 2113cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 2114cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_ptr); 2115cabdff1aSopenharmony_ci 2116cabdff1aSopenharmony_ci /* mask and hev */ 2117cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 2118cabdff1aSopenharmony_ci hev, mask, flat); 2119cabdff1aSopenharmony_ci /* flat4 */ 2120cabdff1aSopenharmony_ci VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 2121cabdff1aSopenharmony_ci /* filter4 */ 2122cabdff1aSopenharmony_ci VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 2123cabdff1aSopenharmony_ci q1_out); 2124cabdff1aSopenharmony_ci 2125cabdff1aSopenharmony_ci /* if flat is zero for all pixels, then no need to calculate other filter */ 2126cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat)) { 2127cabdff1aSopenharmony_ci ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 2128cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec2, vec3); 2129cabdff1aSopenharmony_ci ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 2130cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec4, vec5); 2131cabdff1aSopenharmony_ci 2132cabdff1aSopenharmony_ci src_org -= 2; 2133cabdff1aSopenharmony_ci ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch); 2134cabdff1aSopenharmony_ci ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch); 2135cabdff1aSopenharmony_ci 2136cabdff1aSopenharmony_ci return 1; 2137cabdff1aSopenharmony_ci } else { 2138cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, 2139cabdff1aSopenharmony_ci zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, 2140cabdff1aSopenharmony_ci q3_r); 2141cabdff1aSopenharmony_ci VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 2142cabdff1aSopenharmony_ci p1_filt8_r, p0_filt8_r, q0_filt8_r, 
q1_filt8_r, q2_filt8_r); 2143cabdff1aSopenharmony_ci ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, 2144cabdff1aSopenharmony_ci p0_l); 2145cabdff1aSopenharmony_ci ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, 2146cabdff1aSopenharmony_ci q3_l); 2147cabdff1aSopenharmony_ci VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 2148cabdff1aSopenharmony_ci p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 2149cabdff1aSopenharmony_ci 2150cabdff1aSopenharmony_ci /* convert 16 bit output data into 8 bit */ 2151cabdff1aSopenharmony_ci PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, 2152cabdff1aSopenharmony_ci p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, 2153cabdff1aSopenharmony_ci p0_filt8_r, q0_filt8_r); 2154cabdff1aSopenharmony_ci PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, 2155cabdff1aSopenharmony_ci q2_filt8_r); 2156cabdff1aSopenharmony_ci 2157cabdff1aSopenharmony_ci /* store pixel values */ 2158cabdff1aSopenharmony_ci p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); 2159cabdff1aSopenharmony_ci p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); 2160cabdff1aSopenharmony_ci p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); 2161cabdff1aSopenharmony_ci q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); 2162cabdff1aSopenharmony_ci q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); 2163cabdff1aSopenharmony_ci q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); 2164cabdff1aSopenharmony_ci 2165cabdff1aSopenharmony_ci ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); 2166cabdff1aSopenharmony_ci filter48 += (4 * 16); 2167cabdff1aSopenharmony_ci ST_UB2(q1_out, q2_out, filter48, 16); 2168cabdff1aSopenharmony_ci filter48 += (2 * 16); 2169cabdff1aSopenharmony_ci ST_UB(flat, filter48); 2170cabdff1aSopenharmony_ci 2171cabdff1aSopenharmony_ci return 0; 2172cabdff1aSopenharmony_ci } 2173cabdff1aSopenharmony_ci} 
2174cabdff1aSopenharmony_ci 2175cabdff1aSopenharmony_cistatic int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, 2176cabdff1aSopenharmony_ci uint8_t *filter48) 2177cabdff1aSopenharmony_ci{ 2178cabdff1aSopenharmony_ci v16u8 flat, flat2, filter8; 2179cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 2180cabdff1aSopenharmony_ci v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 2181cabdff1aSopenharmony_ci v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in; 2182cabdff1aSopenharmony_ci v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in; 2183cabdff1aSopenharmony_ci v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in; 2184cabdff1aSopenharmony_ci v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in; 2185cabdff1aSopenharmony_ci v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; 2186cabdff1aSopenharmony_ci v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; 2187cabdff1aSopenharmony_ci v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; 2188cabdff1aSopenharmony_ci v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; 2189cabdff1aSopenharmony_ci v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; 2190cabdff1aSopenharmony_ci v8i16 l_out, r_out; 2191cabdff1aSopenharmony_ci 2192cabdff1aSopenharmony_ci flat = LD_UB(filter48 + 6 * 16); 2193cabdff1aSopenharmony_ci 2194cabdff1aSopenharmony_ci LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); 2195cabdff1aSopenharmony_ci LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); 2196cabdff1aSopenharmony_ci 2197cabdff1aSopenharmony_ci VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 2198cabdff1aSopenharmony_ci 2199cabdff1aSopenharmony_ci /* if flat2 is zero for all pixels, then no need to calculate other filter */ 2200cabdff1aSopenharmony_ci if (__msa_test_bz_v(flat2)) { 2201cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2202cabdff1aSopenharmony_ci 2203cabdff1aSopenharmony_ci LD_UB4(filter48, 16, p2, p1, p0, q0); 2204cabdff1aSopenharmony_ci LD_UB2(filter48 + 4 * 16, 16, q1, q2); 2205cabdff1aSopenharmony_ci 2206cabdff1aSopenharmony_ci ILVR_B2_SH(p1, 
p2, q0, p0, vec0, vec1); 2207cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec3, vec4); 2208cabdff1aSopenharmony_ci ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); 2209cabdff1aSopenharmony_ci ILVRL_H2_SH(vec1, vec0, vec6, vec7); 2210cabdff1aSopenharmony_ci ILVRL_B2_SH(q2, q1, vec2, vec5); 2211cabdff1aSopenharmony_ci 2212cabdff1aSopenharmony_ci src_org -= 3; 2213cabdff1aSopenharmony_ci ST_W4(vec3, 0, 1, 2, 3, src_org, pitch); 2214cabdff1aSopenharmony_ci ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch); 2215cabdff1aSopenharmony_ci src_org += (4 * pitch); 2216cabdff1aSopenharmony_ci ST_W4(vec4, 0, 1, 2, 3, src_org, pitch); 2217cabdff1aSopenharmony_ci ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch); 2218cabdff1aSopenharmony_ci src_org += (4 * pitch); 2219cabdff1aSopenharmony_ci ST_W4(vec6, 0, 1, 2, 3, src_org, pitch); 2220cabdff1aSopenharmony_ci ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch); 2221cabdff1aSopenharmony_ci src_org += (4 * pitch); 2222cabdff1aSopenharmony_ci ST_W4(vec7, 0, 1, 2, 3, src_org, pitch); 2223cabdff1aSopenharmony_ci ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch); 2224cabdff1aSopenharmony_ci 2225cabdff1aSopenharmony_ci return 1; 2226cabdff1aSopenharmony_ci } else { 2227cabdff1aSopenharmony_ci src -= 7 * 16; 2228cabdff1aSopenharmony_ci 2229cabdff1aSopenharmony_ci ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, 2230cabdff1aSopenharmony_ci zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, 2231cabdff1aSopenharmony_ci p3_r_in, p2_r_in, p1_r_in, p0_r_in); 2232cabdff1aSopenharmony_ci q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0); 2233cabdff1aSopenharmony_ci 2234cabdff1aSopenharmony_ci tmp0_r = p7_r_in << 3; 2235cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 2236cabdff1aSopenharmony_ci tmp0_r += p6_r_in; 2237cabdff1aSopenharmony_ci tmp0_r += q0_r_in; 2238cabdff1aSopenharmony_ci tmp1_r = p6_r_in + p5_r_in; 2239cabdff1aSopenharmony_ci tmp1_r += p4_r_in; 2240cabdff1aSopenharmony_ci tmp1_r += p3_r_in; 2241cabdff1aSopenharmony_ci tmp1_r += 
p2_r_in; 2242cabdff1aSopenharmony_ci tmp1_r += p1_r_in; 2243cabdff1aSopenharmony_ci tmp1_r += p0_r_in; 2244cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2245cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2246cabdff1aSopenharmony_ci 2247cabdff1aSopenharmony_ci ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, 2248cabdff1aSopenharmony_ci p5_l_in, p4_l_in); 2249cabdff1aSopenharmony_ci ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, 2250cabdff1aSopenharmony_ci p1_l_in, p0_l_in); 2251cabdff1aSopenharmony_ci q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0); 2252cabdff1aSopenharmony_ci 2253cabdff1aSopenharmony_ci tmp0_l = p7_l_in << 3; 2254cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 2255cabdff1aSopenharmony_ci tmp0_l += p6_l_in; 2256cabdff1aSopenharmony_ci tmp0_l += q0_l_in; 2257cabdff1aSopenharmony_ci tmp1_l = p6_l_in + p5_l_in; 2258cabdff1aSopenharmony_ci tmp1_l += p4_l_in; 2259cabdff1aSopenharmony_ci tmp1_l += p3_l_in; 2260cabdff1aSopenharmony_ci tmp1_l += p2_l_in; 2261cabdff1aSopenharmony_ci tmp1_l += p1_l_in; 2262cabdff1aSopenharmony_ci tmp1_l += p0_l_in; 2263cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2264cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2265cabdff1aSopenharmony_ci 2266cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2267cabdff1aSopenharmony_ci p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2); 2268cabdff1aSopenharmony_ci ST_UB(p6, src); 2269cabdff1aSopenharmony_ci src += 16; 2270cabdff1aSopenharmony_ci 2271cabdff1aSopenharmony_ci /* p5 */ 2272cabdff1aSopenharmony_ci q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1); 2273cabdff1aSopenharmony_ci tmp0_r = p5_r_in - p6_r_in; 2274cabdff1aSopenharmony_ci tmp0_r += q1_r_in; 2275cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 2276cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2277cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2278cabdff1aSopenharmony_ci q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1); 
2279cabdff1aSopenharmony_ci tmp0_l = p5_l_in - p6_l_in; 2280cabdff1aSopenharmony_ci tmp0_l += q1_l_in; 2281cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 2282cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2283cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2284cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2285cabdff1aSopenharmony_ci p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2); 2286cabdff1aSopenharmony_ci ST_UB(p5, src); 2287cabdff1aSopenharmony_ci src += 16; 2288cabdff1aSopenharmony_ci 2289cabdff1aSopenharmony_ci /* p4 */ 2290cabdff1aSopenharmony_ci q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2); 2291cabdff1aSopenharmony_ci tmp0_r = p4_r_in - p5_r_in; 2292cabdff1aSopenharmony_ci tmp0_r += q2_r_in; 2293cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 2294cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2295cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2296cabdff1aSopenharmony_ci q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2); 2297cabdff1aSopenharmony_ci tmp0_l = p4_l_in - p5_l_in; 2298cabdff1aSopenharmony_ci tmp0_l += q2_l_in; 2299cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 2300cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2301cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2302cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2303cabdff1aSopenharmony_ci p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2); 2304cabdff1aSopenharmony_ci ST_UB(p4, src); 2305cabdff1aSopenharmony_ci src += 16; 2306cabdff1aSopenharmony_ci 2307cabdff1aSopenharmony_ci /* p3 */ 2308cabdff1aSopenharmony_ci q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3); 2309cabdff1aSopenharmony_ci tmp0_r = p3_r_in - p4_r_in; 2310cabdff1aSopenharmony_ci tmp0_r += q3_r_in; 2311cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 2312cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2313cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2314cabdff1aSopenharmony_ci q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3); 
2315cabdff1aSopenharmony_ci tmp0_l = p3_l_in - p4_l_in; 2316cabdff1aSopenharmony_ci tmp0_l += q3_l_in; 2317cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 2318cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2319cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2320cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2321cabdff1aSopenharmony_ci p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2); 2322cabdff1aSopenharmony_ci ST_UB(p3, src); 2323cabdff1aSopenharmony_ci src += 16; 2324cabdff1aSopenharmony_ci 2325cabdff1aSopenharmony_ci /* p2 */ 2326cabdff1aSopenharmony_ci q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4); 2327cabdff1aSopenharmony_ci filter8 = LD_UB(filter48); 2328cabdff1aSopenharmony_ci tmp0_r = p2_r_in - p3_r_in; 2329cabdff1aSopenharmony_ci tmp0_r += q4_r_in; 2330cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 2331cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2332cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2333cabdff1aSopenharmony_ci q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4); 2334cabdff1aSopenharmony_ci tmp0_l = p2_l_in - p3_l_in; 2335cabdff1aSopenharmony_ci tmp0_l += q4_l_in; 2336cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 2337cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2338cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2339cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2340cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2341cabdff1aSopenharmony_ci ST_UB(filter8, src); 2342cabdff1aSopenharmony_ci src += 16; 2343cabdff1aSopenharmony_ci 2344cabdff1aSopenharmony_ci /* p1 */ 2345cabdff1aSopenharmony_ci q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5); 2346cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 16); 2347cabdff1aSopenharmony_ci tmp0_r = p1_r_in - p2_r_in; 2348cabdff1aSopenharmony_ci tmp0_r += q5_r_in; 2349cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 2350cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 
2351cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2352cabdff1aSopenharmony_ci q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5); 2353cabdff1aSopenharmony_ci tmp0_l = p1_l_in - p2_l_in; 2354cabdff1aSopenharmony_ci tmp0_l += q5_l_in; 2355cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 2356cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2357cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) (tmp1_l), 4); 2358cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2359cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2360cabdff1aSopenharmony_ci ST_UB(filter8, src); 2361cabdff1aSopenharmony_ci src += 16; 2362cabdff1aSopenharmony_ci 2363cabdff1aSopenharmony_ci /* p0 */ 2364cabdff1aSopenharmony_ci q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6); 2365cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 32); 2366cabdff1aSopenharmony_ci tmp0_r = p0_r_in - p1_r_in; 2367cabdff1aSopenharmony_ci tmp0_r += q6_r_in; 2368cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 2369cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2370cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2371cabdff1aSopenharmony_ci q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6); 2372cabdff1aSopenharmony_ci tmp0_l = p0_l_in - p1_l_in; 2373cabdff1aSopenharmony_ci tmp0_l += q6_l_in; 2374cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 2375cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2376cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2377cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2378cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2379cabdff1aSopenharmony_ci ST_UB(filter8, src); 2380cabdff1aSopenharmony_ci src += 16; 2381cabdff1aSopenharmony_ci 2382cabdff1aSopenharmony_ci /* q0 */ 2383cabdff1aSopenharmony_ci q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7); 2384cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 48); 2385cabdff1aSopenharmony_ci tmp0_r = q7_r_in - 
p0_r_in; 2386cabdff1aSopenharmony_ci tmp0_r += q0_r_in; 2387cabdff1aSopenharmony_ci tmp0_r -= p7_r_in; 2388cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2389cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2390cabdff1aSopenharmony_ci q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7); 2391cabdff1aSopenharmony_ci tmp0_l = q7_l_in - p0_l_in; 2392cabdff1aSopenharmony_ci tmp0_l += q0_l_in; 2393cabdff1aSopenharmony_ci tmp0_l -= p7_l_in; 2394cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2395cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2396cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2397cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2398cabdff1aSopenharmony_ci ST_UB(filter8, src); 2399cabdff1aSopenharmony_ci src += 16; 2400cabdff1aSopenharmony_ci 2401cabdff1aSopenharmony_ci /* q1 */ 2402cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 64); 2403cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q0_r_in; 2404cabdff1aSopenharmony_ci tmp0_r += q1_r_in; 2405cabdff1aSopenharmony_ci tmp0_r -= p6_r_in; 2406cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2407cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2408cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q0_l_in; 2409cabdff1aSopenharmony_ci tmp0_l += q1_l_in; 2410cabdff1aSopenharmony_ci tmp0_l -= p6_l_in; 2411cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2412cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2413cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2414cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2415cabdff1aSopenharmony_ci ST_UB(filter8, src); 2416cabdff1aSopenharmony_ci src += 16; 2417cabdff1aSopenharmony_ci 2418cabdff1aSopenharmony_ci /* q2 */ 2419cabdff1aSopenharmony_ci filter8 = LD_UB(filter48 + 80); 2420cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q1_r_in; 2421cabdff1aSopenharmony_ci tmp0_r += q2_r_in; 2422cabdff1aSopenharmony_ci 
tmp0_r -= p5_r_in; 2423cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2424cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2425cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q1_l_in; 2426cabdff1aSopenharmony_ci tmp0_l += q2_l_in; 2427cabdff1aSopenharmony_ci tmp0_l -= p5_l_in; 2428cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2429cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2430cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2431cabdff1aSopenharmony_ci filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2432cabdff1aSopenharmony_ci ST_UB(filter8, src); 2433cabdff1aSopenharmony_ci src += 16; 2434cabdff1aSopenharmony_ci 2435cabdff1aSopenharmony_ci /* q3 */ 2436cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q2_r_in; 2437cabdff1aSopenharmony_ci tmp0_r += q3_r_in; 2438cabdff1aSopenharmony_ci tmp0_r -= p4_r_in; 2439cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2440cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2441cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q2_l_in; 2442cabdff1aSopenharmony_ci tmp0_l += q3_l_in; 2443cabdff1aSopenharmony_ci tmp0_l -= p4_l_in; 2444cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2445cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2446cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2447cabdff1aSopenharmony_ci q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2); 2448cabdff1aSopenharmony_ci ST_UB(q3, src); 2449cabdff1aSopenharmony_ci src += 16; 2450cabdff1aSopenharmony_ci 2451cabdff1aSopenharmony_ci /* q4 */ 2452cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q3_r_in; 2453cabdff1aSopenharmony_ci tmp0_r += q4_r_in; 2454cabdff1aSopenharmony_ci tmp0_r -= p3_r_in; 2455cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2456cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2457cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q3_l_in; 2458cabdff1aSopenharmony_ci tmp0_l += q4_l_in; 2459cabdff1aSopenharmony_ci tmp0_l -= p3_l_in; 
2460cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2461cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2462cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2463cabdff1aSopenharmony_ci q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2); 2464cabdff1aSopenharmony_ci ST_UB(q4, src); 2465cabdff1aSopenharmony_ci src += 16; 2466cabdff1aSopenharmony_ci 2467cabdff1aSopenharmony_ci /* q5 */ 2468cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q4_r_in; 2469cabdff1aSopenharmony_ci tmp0_r += q5_r_in; 2470cabdff1aSopenharmony_ci tmp0_r -= p2_r_in; 2471cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2472cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2473cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q4_l_in; 2474cabdff1aSopenharmony_ci tmp0_l += q5_l_in; 2475cabdff1aSopenharmony_ci tmp0_l -= p2_l_in; 2476cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2477cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2478cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2479cabdff1aSopenharmony_ci q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2); 2480cabdff1aSopenharmony_ci ST_UB(q5, src); 2481cabdff1aSopenharmony_ci src += 16; 2482cabdff1aSopenharmony_ci 2483cabdff1aSopenharmony_ci /* q6 */ 2484cabdff1aSopenharmony_ci tmp0_r = q7_r_in - q5_r_in; 2485cabdff1aSopenharmony_ci tmp0_r += q6_r_in; 2486cabdff1aSopenharmony_ci tmp0_r -= p1_r_in; 2487cabdff1aSopenharmony_ci tmp1_r += tmp0_r; 2488cabdff1aSopenharmony_ci r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2489cabdff1aSopenharmony_ci tmp0_l = q7_l_in - q5_l_in; 2490cabdff1aSopenharmony_ci tmp0_l += q6_l_in; 2491cabdff1aSopenharmony_ci tmp0_l -= p1_l_in; 2492cabdff1aSopenharmony_ci tmp1_l += tmp0_l; 2493cabdff1aSopenharmony_ci l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2494cabdff1aSopenharmony_ci r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2495cabdff1aSopenharmony_ci q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2); 2496cabdff1aSopenharmony_ci ST_UB(q6, 
src); 2497cabdff1aSopenharmony_ci 2498cabdff1aSopenharmony_ci return 0; 2499cabdff1aSopenharmony_ci } 2500cabdff1aSopenharmony_ci} 2501cabdff1aSopenharmony_ci
/*
 * Entry point for the 16-wide loop filter on a vertical edge, 16 rows tall.
 *
 * The pixels around the edge are transposed from (src - 8) into a local
 * scratch buffer, the two filter stages are run on that buffer, and the
 * buffer is transposed back to (src - 8) only when neither stage reports
 * a non-zero "early exit" status.
 *
 * src           pointer passed to both filter stages; the transposes
 *               operate on (src - 8)
 * pitch         stride used with src
 * b_limit_ptr,
 * limit_ptr,
 * thresh_ptr    forwarded unchanged to vp9_vt_lpf_t4_and_t8_16w()
 *
 * NOTE(review): a stage that returns non-zero presumably has already stored
 * its result through src/pitch itself, which would be why the final
 * transpose is skipped - confirm against the helper implementations.
 */
void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    /* 16 * 16 bytes receive the transposed block; the trailing 16 * 8 bytes
     * are handed to the filter stages as their shared "filter48" area. */
    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
    uint8_t *edge_rows = transposed_input + 16 * 8;
    uint8_t *filter48 = transposed_input + 16 * 16;
    /* Stage status is deliberately narrowed to 8 bits before being tested. */
    uint8_t stage_done;

    vp9_transpose_16x16(src - 8, pitch, transposed_input, 16);

    stage_done = vp9_vt_lpf_t4_and_t8_16w(edge_rows, filter48, src, pitch,
                                          b_limit_ptr, limit_ptr, thresh_ptr);
    if (stage_done != 0) {
        return;
    }

    stage_done = vp9_vt_lpf_t16_16w(edge_rows, src, pitch, filter48);
    if (stage_done != 0) {
        return;
    }

    /* Neither stage exited early: copy the buffer back in its original
     * orientation. */
    vp9_transpose_16x16(transposed_input, 16, src - 8, pitch);
}