1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavcodec/vp8dsp.h" 22cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 23cabdff1aSopenharmony_ci#include "vp8dsp_mips.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_ci#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) \ 26cabdff1aSopenharmony_ci{ \ 27cabdff1aSopenharmony_ci v16u8 p1_a_sub_q1, p0_a_sub_q0; \ 28cabdff1aSopenharmony_ci \ 29cabdff1aSopenharmony_ci p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \ 30cabdff1aSopenharmony_ci p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \ 31cabdff1aSopenharmony_ci p1_a_sub_q1 = (v16u8) __msa_srli_b((v16i8) p1_a_sub_q1, 1); \ 32cabdff1aSopenharmony_ci p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \ 33cabdff1aSopenharmony_ci mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \ 34cabdff1aSopenharmony_ci mask = ((v16u8) mask <= b_limit); \ 35cabdff1aSopenharmony_ci} 36cabdff1aSopenharmony_ci 37cabdff1aSopenharmony_ci#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out, \ 38cabdff1aSopenharmony_ci mask_in, hev_in) \ 39cabdff1aSopenharmony_ci{ \ 40cabdff1aSopenharmony_ci v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ 41cabdff1aSopenharmony_ci v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ 42cabdff1aSopenharmony_ci v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ 43cabdff1aSopenharmony_ci \ 44cabdff1aSopenharmony_ci p1_m = (v16i8) __msa_xori_b(p1_in_out, 0x80); \ 45cabdff1aSopenharmony_ci p0_m = (v16i8) __msa_xori_b(p0_in_out, 0x80); \ 46cabdff1aSopenharmony_ci q0_m = (v16i8) __msa_xori_b(q0_in_out, 0x80); \ 47cabdff1aSopenharmony_ci q1_m = (v16i8) __msa_xori_b(q1_in_out, 0x80); \ 48cabdff1aSopenharmony_ci \ 49cabdff1aSopenharmony_ci filt = __msa_subs_s_b(p1_m, q1_m); \ 50cabdff1aSopenharmony_ci \ 51cabdff1aSopenharmony_ci filt = filt & (v16i8) hev_in; \ 52cabdff1aSopenharmony_ci \ 53cabdff1aSopenharmony_ci q0_sub_p0 = q0_m - p0_m; \ 54cabdff1aSopenharmony_ci filt_sign = __msa_clti_s_b(filt, 0); \ 55cabdff1aSopenharmony_ci \ 56cabdff1aSopenharmony_ci cnst3h = __msa_ldi_h(3); \ 57cabdff1aSopenharmony_ci q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ 58cabdff1aSopenharmony_ci q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h); \ 59cabdff1aSopenharmony_ci filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \ 60cabdff1aSopenharmony_ci filt_r += q0_sub_p0_r; \ 61cabdff1aSopenharmony_ci filt_r = __msa_sat_s_h(filt_r, 7); \ 62cabdff1aSopenharmony_ci \ 63cabdff1aSopenharmony_ci q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ 64cabdff1aSopenharmony_ci q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h); \ 65cabdff1aSopenharmony_ci filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \ 66cabdff1aSopenharmony_ci filt_l += q0_sub_p0_l; \ 67cabdff1aSopenharmony_ci filt_l = __msa_sat_s_h(filt_l, 7); \ 68cabdff1aSopenharmony_ci \ 69cabdff1aSopenharmony_ci filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \ 70cabdff1aSopenharmony_ci filt = filt & (v16i8) mask_in; \ 71cabdff1aSopenharmony_ci \ 72cabdff1aSopenharmony_ci cnst4b = __msa_ldi_b(4); \ 73cabdff1aSopenharmony_ci filt1 = __msa_adds_s_b(filt, cnst4b); \ 74cabdff1aSopenharmony_ci filt1 >>= 3; \ 75cabdff1aSopenharmony_ci \ 76cabdff1aSopenharmony_ci cnst3b = __msa_ldi_b(3); \ 77cabdff1aSopenharmony_ci filt2 = __msa_adds_s_b(filt, cnst3b); \ 78cabdff1aSopenharmony_ci filt2 >>= 3; \ 79cabdff1aSopenharmony_ci \ 80cabdff1aSopenharmony_ci q0_m = __msa_subs_s_b(q0_m, filt1); \ 81cabdff1aSopenharmony_ci q0_in_out = __msa_xori_b((v16u8) q0_m, 0x80); \ 82cabdff1aSopenharmony_ci p0_m = __msa_adds_s_b(p0_m, filt2); \ 83cabdff1aSopenharmony_ci p0_in_out = __msa_xori_b((v16u8) p0_m, 0x80); \ 84cabdff1aSopenharmony_ci \ 85cabdff1aSopenharmony_ci filt = __msa_srari_b(filt1, 1); \ 86cabdff1aSopenharmony_ci hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \ 87cabdff1aSopenharmony_ci filt = filt & (v16i8) hev_in; \ 88cabdff1aSopenharmony_ci \ 89cabdff1aSopenharmony_ci q1_m = __msa_subs_s_b(q1_m, filt); \ 90cabdff1aSopenharmony_ci q1_in_out = __msa_xori_b((v16u8) q1_m, 0x80); \ 91cabdff1aSopenharmony_ci p1_m = __msa_adds_s_b(p1_m, filt); \ 92cabdff1aSopenharmony_ci p1_in_out = __msa_xori_b((v16u8) p1_m, 0x80); \ 93cabdff1aSopenharmony_ci} 94cabdff1aSopenharmony_ci 95cabdff1aSopenharmony_ci#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \ 96cabdff1aSopenharmony_ci{ \ 97cabdff1aSopenharmony_ci v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign; \ 98cabdff1aSopenharmony_ci v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign; \ 99cabdff1aSopenharmony_ci v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ 100cabdff1aSopenharmony_ci \ 101cabdff1aSopenharmony_ci p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \ 102cabdff1aSopenharmony_ci p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \ 103cabdff1aSopenharmony_ci q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \ 104cabdff1aSopenharmony_ci q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \ 105cabdff1aSopenharmony_ci \ 106cabdff1aSopenharmony_ci filt = __msa_subs_s_b(p1_m, q1_m); \ 107cabdff1aSopenharmony_ci \ 108cabdff1aSopenharmony_ci q0_sub_p0 = q0_m - p0_m; \ 109cabdff1aSopenharmony_ci filt_sign = __msa_clti_s_b(filt, 0); \ 110cabdff1aSopenharmony_ci \ 111cabdff1aSopenharmony_ci cnst3h = __msa_ldi_h(3); \ 112cabdff1aSopenharmony_ci q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \ 113cabdff1aSopenharmony_ci q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \ 114cabdff1aSopenharmony_ci q0_sub_p0_r *= cnst3h; \ 115cabdff1aSopenharmony_ci filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \ 116cabdff1aSopenharmony_ci filt_r += q0_sub_p0_r; \ 117cabdff1aSopenharmony_ci filt_r = __msa_sat_s_h(filt_r, 7); \ 118cabdff1aSopenharmony_ci \ 119cabdff1aSopenharmony_ci q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \ 120cabdff1aSopenharmony_ci q0_sub_p0_l *= cnst3h; \ 121cabdff1aSopenharmony_ci filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \ 122cabdff1aSopenharmony_ci filt_l += q0_sub_p0_l; \ 123cabdff1aSopenharmony_ci filt_l = __msa_sat_s_h(filt_l, 7); \ 124cabdff1aSopenharmony_ci \ 125cabdff1aSopenharmony_ci filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \ 126cabdff1aSopenharmony_ci filt = filt & (v16i8) (mask); \ 127cabdff1aSopenharmony_ci \ 128cabdff1aSopenharmony_ci cnst4b = __msa_ldi_b(4); \ 129cabdff1aSopenharmony_ci filt1 = __msa_adds_s_b(filt, cnst4b); \ 130cabdff1aSopenharmony_ci filt1 >>= 3; \ 131cabdff1aSopenharmony_ci \ 132cabdff1aSopenharmony_ci cnst3b = __msa_ldi_b(3); \ 133cabdff1aSopenharmony_ci filt2 = __msa_adds_s_b(filt, cnst3b); \ 134cabdff1aSopenharmony_ci filt2 >>= 3; \ 135cabdff1aSopenharmony_ci \ 136cabdff1aSopenharmony_ci q0_m = __msa_subs_s_b(q0_m, filt1); \ 137cabdff1aSopenharmony_ci p0_m = __msa_adds_s_b(p0_m, filt2); \ 138cabdff1aSopenharmony_ci q0_in = __msa_xori_b((v16u8) q0_m, 0x80); \ 139cabdff1aSopenharmony_ci p0_in = __msa_xori_b((v16u8) p0_m, 0x80); \ 140cabdff1aSopenharmony_ci} 141cabdff1aSopenharmony_ci 142cabdff1aSopenharmony_ci#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ 143cabdff1aSopenharmony_ci{ \ 144cabdff1aSopenharmony_ci v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ 145cabdff1aSopenharmony_ci v16i8 filt, q0_sub_p0, cnst4b, cnst3b; \ 146cabdff1aSopenharmony_ci v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign; \ 147cabdff1aSopenharmony_ci v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l; \ 148cabdff1aSopenharmony_ci v8i16 cnst3h, cnst27h, cnst18h, cnst63h; \ 149cabdff1aSopenharmony_ci \ 150cabdff1aSopenharmony_ci cnst3h = __msa_ldi_h(3); \ 151cabdff1aSopenharmony_ci \ 152cabdff1aSopenharmony_ci p2_m = (v16i8) __msa_xori_b(p2, 0x80); \ 153cabdff1aSopenharmony_ci p1_m = (v16i8) __msa_xori_b(p1, 0x80); \ 154cabdff1aSopenharmony_ci p0_m = (v16i8) __msa_xori_b(p0, 0x80); \ 155cabdff1aSopenharmony_ci q0_m = (v16i8) __msa_xori_b(q0, 0x80); \ 156cabdff1aSopenharmony_ci q1_m = (v16i8) __msa_xori_b(q1, 0x80); \ 157cabdff1aSopenharmony_ci q2_m = (v16i8) __msa_xori_b(q2, 0x80); \ 158cabdff1aSopenharmony_ci \ 159cabdff1aSopenharmony_ci filt = __msa_subs_s_b(p1_m, q1_m); \ 160cabdff1aSopenharmony_ci q0_sub_p0 = q0_m - p0_m; \ 161cabdff1aSopenharmony_ci q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \ 162cabdff1aSopenharmony_ci filt_sign = __msa_clti_s_b(filt, 0); \ 163cabdff1aSopenharmony_ci \ 164cabdff1aSopenharmony_ci /* right part */ \ 165cabdff1aSopenharmony_ci q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \ 166cabdff1aSopenharmony_ci q0_sub_p0_r *= cnst3h; \ 167cabdff1aSopenharmony_ci filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \ 168cabdff1aSopenharmony_ci filt_r = filt_r + q0_sub_p0_r; \ 169cabdff1aSopenharmony_ci filt_r = __msa_sat_s_h(filt_r, 7); \ 170cabdff1aSopenharmony_ci \ 171cabdff1aSopenharmony_ci /* left part */ \ 172cabdff1aSopenharmony_ci q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \ 173cabdff1aSopenharmony_ci q0_sub_p0_l *= cnst3h; \ 174cabdff1aSopenharmony_ci filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \ 175cabdff1aSopenharmony_ci filt_l = filt_l + q0_sub_p0_l; \ 176cabdff1aSopenharmony_ci filt_l = __msa_sat_s_h(filt_l, 7); \ 177cabdff1aSopenharmony_ci \ 178cabdff1aSopenharmony_ci /* combine left and right part */ \ 179cabdff1aSopenharmony_ci filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \ 180cabdff1aSopenharmony_ci filt = filt & (v16i8) mask; \ 181cabdff1aSopenharmony_ci filt2 = filt & (v16i8) hev; \ 182cabdff1aSopenharmony_ci \ 183cabdff1aSopenharmony_ci /* filt_val &= ~hev */ \ 184cabdff1aSopenharmony_ci hev = __msa_xori_b(hev, 0xff); \ 185cabdff1aSopenharmony_ci filt = filt & (v16i8) hev; \ 186cabdff1aSopenharmony_ci cnst4b = __msa_ldi_b(4); \ 187cabdff1aSopenharmony_ci filt1 = __msa_adds_s_b(filt2, cnst4b); \ 188cabdff1aSopenharmony_ci filt1 >>= 3; \ 189cabdff1aSopenharmony_ci cnst3b = __msa_ldi_b(3); \ 190cabdff1aSopenharmony_ci filt2 = __msa_adds_s_b(filt2, cnst3b); \ 191cabdff1aSopenharmony_ci filt2 >>= 3; \ 192cabdff1aSopenharmony_ci q0_m = __msa_subs_s_b(q0_m, filt1); \ 193cabdff1aSopenharmony_ci p0_m = __msa_adds_s_b(p0_m, filt2); \ 194cabdff1aSopenharmony_ci \ 195cabdff1aSopenharmony_ci filt_sign = __msa_clti_s_b(filt, 0); \ 196cabdff1aSopenharmony_ci ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \ 197cabdff1aSopenharmony_ci \ 198cabdff1aSopenharmony_ci cnst27h = __msa_ldi_h(27); \ 199cabdff1aSopenharmony_ci cnst63h = __msa_ldi_h(63); \ 200cabdff1aSopenharmony_ci \ 201cabdff1aSopenharmony_ci /* right part */ \ 202cabdff1aSopenharmony_ci u_r = filt_r * cnst27h; \ 203cabdff1aSopenharmony_ci u_r += cnst63h; \ 204cabdff1aSopenharmony_ci u_r >>= 7; \ 205cabdff1aSopenharmony_ci u_r = __msa_sat_s_h(u_r, 7); \ 206cabdff1aSopenharmony_ci /* left part */ \ 207cabdff1aSopenharmony_ci u_l = filt_l * cnst27h; \ 208cabdff1aSopenharmony_ci u_l += cnst63h; \ 209cabdff1aSopenharmony_ci u_l >>= 7; \ 210cabdff1aSopenharmony_ci u_l = __msa_sat_s_h(u_l, 7); \ 211cabdff1aSopenharmony_ci /* combine left and right part */ \ 212cabdff1aSopenharmony_ci u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r); \ 213cabdff1aSopenharmony_ci q0_m = __msa_subs_s_b(q0_m, u); \ 214cabdff1aSopenharmony_ci q0 = __msa_xori_b((v16u8) q0_m, 0x80); \ 215cabdff1aSopenharmony_ci p0_m = __msa_adds_s_b(p0_m, u); \ 216cabdff1aSopenharmony_ci p0 = __msa_xori_b((v16u8) p0_m, 0x80); \ 217cabdff1aSopenharmony_ci cnst18h = __msa_ldi_h(18); \ 218cabdff1aSopenharmony_ci u_r = filt_r * cnst18h; \ 219cabdff1aSopenharmony_ci u_r += cnst63h; \ 220cabdff1aSopenharmony_ci u_r >>= 7; \ 221cabdff1aSopenharmony_ci u_r = __msa_sat_s_h(u_r, 7); \ 222cabdff1aSopenharmony_ci \ 223cabdff1aSopenharmony_ci /* left part */ \ 224cabdff1aSopenharmony_ci u_l = filt_l * cnst18h; \ 225cabdff1aSopenharmony_ci u_l += cnst63h; \ 226cabdff1aSopenharmony_ci u_l >>= 7; \ 227cabdff1aSopenharmony_ci u_l = __msa_sat_s_h(u_l, 7); \ 228cabdff1aSopenharmony_ci /* combine left and right part */ \ 229cabdff1aSopenharmony_ci u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r); \ 230cabdff1aSopenharmony_ci q1_m = __msa_subs_s_b(q1_m, u); \ 231cabdff1aSopenharmony_ci q1 = __msa_xori_b((v16u8) q1_m, 0x80); \ 232cabdff1aSopenharmony_ci p1_m = __msa_adds_s_b(p1_m, u); \ 233cabdff1aSopenharmony_ci p1 = __msa_xori_b((v16u8) p1_m, 0x80); \ 234cabdff1aSopenharmony_ci u_r = filt_r << 3; \ 235cabdff1aSopenharmony_ci u_r += filt_r + cnst63h; \ 236cabdff1aSopenharmony_ci u_r >>= 7; \ 237cabdff1aSopenharmony_ci u_r = __msa_sat_s_h(u_r, 7); \ 238cabdff1aSopenharmony_ci \ 239cabdff1aSopenharmony_ci /* left part */ \ 240cabdff1aSopenharmony_ci u_l = filt_l << 3; \ 241cabdff1aSopenharmony_ci u_l += filt_l + cnst63h; \ 242cabdff1aSopenharmony_ci u_l >>= 7; \ 243cabdff1aSopenharmony_ci u_l = __msa_sat_s_h(u_l, 7); \ 244cabdff1aSopenharmony_ci /* combine left and right part */ \ 245cabdff1aSopenharmony_ci u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r); \ 246cabdff1aSopenharmony_ci q2_m = __msa_subs_s_b(q2_m, u); \ 247cabdff1aSopenharmony_ci q2 = __msa_xori_b((v16u8) q2_m, 0x80); \ 248cabdff1aSopenharmony_ci p2_m = __msa_adds_s_b(p2_m, u); \ 249cabdff1aSopenharmony_ci p2 = __msa_xori_b((v16u8) p2_m, 0x80); \ 250cabdff1aSopenharmony_ci} 251cabdff1aSopenharmony_ci 252cabdff1aSopenharmony_ci#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \ 253cabdff1aSopenharmony_ci q0_in, q1_in, q2_in, q3_in, \ 254cabdff1aSopenharmony_ci limit_in, b_limit_in, thresh_in, \ 255cabdff1aSopenharmony_ci hev_out, mask_out, flat_out) \ 256cabdff1aSopenharmony_ci{ \ 257cabdff1aSopenharmony_ci v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ 258cabdff1aSopenharmony_ci v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ 259cabdff1aSopenharmony_ci \ 260cabdff1aSopenharmony_ci /* absolute subtraction of pixel values */ \ 261cabdff1aSopenharmony_ci p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in)); \ 262cabdff1aSopenharmony_ci p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in)); \ 263cabdff1aSopenharmony_ci p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in)); \ 264cabdff1aSopenharmony_ci q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in)); \ 265cabdff1aSopenharmony_ci q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in)); \ 266cabdff1aSopenharmony_ci q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in)); \ 267cabdff1aSopenharmony_ci p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in)); \ 268cabdff1aSopenharmony_ci p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in)); \ 269cabdff1aSopenharmony_ci /* calculation of hev */ \ 270cabdff1aSopenharmony_ci flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ 271cabdff1aSopenharmony_ci hev_out = (thresh_in) < (v16u8) flat_out; \ 272cabdff1aSopenharmony_ci /* calculation of mask */ \ 273cabdff1aSopenharmony_ci p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ 274cabdff1aSopenharmony_ci p1_asub_q1_m >>= 1; \ 275cabdff1aSopenharmony_ci p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ 276cabdff1aSopenharmony_ci mask_out = (b_limit_in) < p0_asub_q0_m; \ 277cabdff1aSopenharmony_ci mask_out = __msa_max_u_b(flat_out, mask_out); \ 278cabdff1aSopenharmony_ci p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ 279cabdff1aSopenharmony_ci mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ 280cabdff1aSopenharmony_ci q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ 281cabdff1aSopenharmony_ci mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ 282cabdff1aSopenharmony_ci mask_out = (limit_in) < (v16u8) mask_out; \ 283cabdff1aSopenharmony_ci mask_out = __msa_xori_b(mask_out, 0xff); \ 284cabdff1aSopenharmony_ci} 285cabdff1aSopenharmony_ci 286cabdff1aSopenharmony_ci#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) \ 287cabdff1aSopenharmony_ci{ \ 288cabdff1aSopenharmony_ci uint16_t tmp0_h; \ 289cabdff1aSopenharmony_ci uint32_t tmp0_w; \ 290cabdff1aSopenharmony_ci \ 291cabdff1aSopenharmony_ci tmp0_w = __msa_copy_u_w((v4i32) in0, in0_idx); \ 292cabdff1aSopenharmony_ci tmp0_h = __msa_copy_u_h((v8i16) in1, in1_idx); \ 293cabdff1aSopenharmony_ci SW(tmp0_w, pdst); \ 294cabdff1aSopenharmony_ci SH(tmp0_h, pdst + stride); \ 295cabdff1aSopenharmony_ci} 296cabdff1aSopenharmony_ci 297cabdff1aSopenharmony_civoid ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in, 298cabdff1aSopenharmony_ci int limit_in, int thresh_in) 299cabdff1aSopenharmony_ci{ 300cabdff1aSopenharmony_ci uint8_t *temp_src; 301cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 302cabdff1aSopenharmony_ci v16u8 mask, hev, flat, thresh, limit, b_limit; 303cabdff1aSopenharmony_ci 304cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_in); 305cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_in); 306cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_in); 307cabdff1aSopenharmony_ci /* load vector elements */ 308cabdff1aSopenharmony_ci temp_src = src - (pitch << 2); 309cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3); 310cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 311cabdff1aSopenharmony_ci hev, mask, flat); 312cabdff1aSopenharmony_ci VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); 313cabdff1aSopenharmony_ci /* store vector elements */ 314cabdff1aSopenharmony_ci temp_src = src - 3 * pitch; 315cabdff1aSopenharmony_ci ST_UB4(p2, p1, p0, q0, temp_src, pitch); 316cabdff1aSopenharmony_ci temp_src += (4 * pitch); 317cabdff1aSopenharmony_ci ST_UB2(q1, q2, temp_src, pitch); 318cabdff1aSopenharmony_ci} 319cabdff1aSopenharmony_ci 320cabdff1aSopenharmony_civoid ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v, 321cabdff1aSopenharmony_ci ptrdiff_t pitch, int b_limit_in, int limit_in, 322cabdff1aSopenharmony_ci int thresh_in) 323cabdff1aSopenharmony_ci{ 324cabdff1aSopenharmony_ci uint8_t *temp_src; 325cabdff1aSopenharmony_ci uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; 326cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 327cabdff1aSopenharmony_ci v16u8 mask, hev, flat, thresh, limit, b_limit; 328cabdff1aSopenharmony_ci v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; 329cabdff1aSopenharmony_ci v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; 330cabdff1aSopenharmony_ci 331cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_in); 332cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_in); 333cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_in); 334cabdff1aSopenharmony_ci 335cabdff1aSopenharmony_ci temp_src = src_u - (pitch << 2); 336cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u); 337cabdff1aSopenharmony_ci temp_src = src_v - (pitch << 2); 338cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v); 339cabdff1aSopenharmony_ci 340cabdff1aSopenharmony_ci /* rht 8 element of p3 are u pixel and left 8 element of p3 are v pixel */ 341cabdff1aSopenharmony_ci ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0); 342cabdff1aSopenharmony_ci ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3); 343cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 344cabdff1aSopenharmony_ci hev, mask, flat); 345cabdff1aSopenharmony_ci VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); 346cabdff1aSopenharmony_ci 347cabdff1aSopenharmony_ci p2_d = __msa_copy_u_d((v2i64) p2, 0); 348cabdff1aSopenharmony_ci p1_d = __msa_copy_u_d((v2i64) p1, 0); 349cabdff1aSopenharmony_ci p0_d = __msa_copy_u_d((v2i64) p0, 0); 350cabdff1aSopenharmony_ci q0_d = __msa_copy_u_d((v2i64) q0, 0); 351cabdff1aSopenharmony_ci q1_d = __msa_copy_u_d((v2i64) q1, 0); 352cabdff1aSopenharmony_ci q2_d = __msa_copy_u_d((v2i64) q2, 0); 353cabdff1aSopenharmony_ci src_u -= (pitch * 3); 354cabdff1aSopenharmony_ci SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch); 355cabdff1aSopenharmony_ci src_u += 4 * pitch; 356cabdff1aSopenharmony_ci SD(q1_d, src_u); 357cabdff1aSopenharmony_ci src_u += pitch; 358cabdff1aSopenharmony_ci SD(q2_d, src_u); 359cabdff1aSopenharmony_ci 360cabdff1aSopenharmony_ci p2_d = __msa_copy_u_d((v2i64) p2, 1); 361cabdff1aSopenharmony_ci p1_d = __msa_copy_u_d((v2i64) p1, 1); 362cabdff1aSopenharmony_ci p0_d = __msa_copy_u_d((v2i64) p0, 1); 363cabdff1aSopenharmony_ci q0_d = __msa_copy_u_d((v2i64) q0, 1); 364cabdff1aSopenharmony_ci q1_d = __msa_copy_u_d((v2i64) q1, 1); 365cabdff1aSopenharmony_ci q2_d = __msa_copy_u_d((v2i64) q2, 1); 366cabdff1aSopenharmony_ci src_v -= (pitch * 3); 367cabdff1aSopenharmony_ci SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch); 368cabdff1aSopenharmony_ci src_v += 4 * pitch; 369cabdff1aSopenharmony_ci SD(q1_d, src_v); 370cabdff1aSopenharmony_ci src_v += pitch; 371cabdff1aSopenharmony_ci SD(q2_d, src_v); 372cabdff1aSopenharmony_ci} 373cabdff1aSopenharmony_ci 374cabdff1aSopenharmony_civoid ff_vp8_h_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in, 375cabdff1aSopenharmony_ci int limit_in, int thresh_in) 376cabdff1aSopenharmony_ci{ 377cabdff1aSopenharmony_ci uint8_t *temp_src; 378cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 379cabdff1aSopenharmony_ci v16u8 mask, hev, flat, thresh, limit, b_limit; 380cabdff1aSopenharmony_ci v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; 381cabdff1aSopenharmony_ci v16u8 row9, row10, row11, row12, row13, row14, row15; 382cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 383cabdff1aSopenharmony_ci 384cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_in); 385cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_in); 386cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_in); 387cabdff1aSopenharmony_ci temp_src = src - 4; 388cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7); 389cabdff1aSopenharmony_ci temp_src += (8 * pitch); 390cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, 391cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15); 392cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, 393cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15, 394cabdff1aSopenharmony_ci p3, p2, p1, p0, q0, q1, q2, q3); 395cabdff1aSopenharmony_ci 396cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 397cabdff1aSopenharmony_ci hev, mask, flat); 398cabdff1aSopenharmony_ci VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); 399cabdff1aSopenharmony_ci ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1); 400cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4); 401cabdff1aSopenharmony_ci ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1); 402cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7); 403cabdff1aSopenharmony_ci ILVRL_B2_SH(q2, q1, tmp2, tmp5); 404cabdff1aSopenharmony_ci 405cabdff1aSopenharmony_ci temp_src = src - 3; 406cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4); 407cabdff1aSopenharmony_ci temp_src += pitch; 408cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4); 409cabdff1aSopenharmony_ci temp_src += pitch; 410cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4); 411cabdff1aSopenharmony_ci temp_src += pitch; 412cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4); 413cabdff1aSopenharmony_ci temp_src += pitch; 414cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4); 415cabdff1aSopenharmony_ci temp_src += pitch; 416cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4); 417cabdff1aSopenharmony_ci temp_src += pitch; 418cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4); 419cabdff1aSopenharmony_ci temp_src += pitch; 420cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4); 421cabdff1aSopenharmony_ci temp_src += pitch; 422cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4); 423cabdff1aSopenharmony_ci temp_src += pitch; 424cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4); 425cabdff1aSopenharmony_ci temp_src += pitch; 426cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4); 427cabdff1aSopenharmony_ci temp_src += pitch; 428cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4); 429cabdff1aSopenharmony_ci temp_src += pitch; 430cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4); 431cabdff1aSopenharmony_ci temp_src += pitch; 432cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4); 433cabdff1aSopenharmony_ci temp_src += pitch; 434cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4); 435cabdff1aSopenharmony_ci temp_src += pitch; 436cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4); 437cabdff1aSopenharmony_ci} 438cabdff1aSopenharmony_ci 439cabdff1aSopenharmony_civoid ff_vp8_h_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v, 440cabdff1aSopenharmony_ci ptrdiff_t pitch, int b_limit_in, int limit_in, 441cabdff1aSopenharmony_ci int thresh_in) 442cabdff1aSopenharmony_ci{ 443cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 444cabdff1aSopenharmony_ci v16u8 mask, hev, flat, thresh, limit, b_limit; 445cabdff1aSopenharmony_ci v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; 446cabdff1aSopenharmony_ci v16u8 row9, row10, row11, row12, row13, row14, row15; 447cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 448cabdff1aSopenharmony_ci 449cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_in); 450cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_in); 451cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_in); 452cabdff1aSopenharmony_ci 453cabdff1aSopenharmony_ci LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); 454cabdff1aSopenharmony_ci LD_UB8(src_v - 4, pitch, 455cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15); 456cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, 457cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15, 458cabdff1aSopenharmony_ci p3, p2, p1, p0, q0, q1, q2, q3); 459cabdff1aSopenharmony_ci 460cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 461cabdff1aSopenharmony_ci hev, mask, flat); 462cabdff1aSopenharmony_ci VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); 463cabdff1aSopenharmony_ci 464cabdff1aSopenharmony_ci ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1); 465cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4); 466cabdff1aSopenharmony_ci ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1); 467cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7); 468cabdff1aSopenharmony_ci ILVRL_B2_SH(q2, q1, tmp2, tmp5); 469cabdff1aSopenharmony_ci 470cabdff1aSopenharmony_ci src_u -= 3; 471cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4); 472cabdff1aSopenharmony_ci src_u += pitch; 473cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4); 474cabdff1aSopenharmony_ci src_u += pitch; 475cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4); 476cabdff1aSopenharmony_ci src_u += pitch; 477cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4); 478cabdff1aSopenharmony_ci src_u += pitch; 479cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4); 480cabdff1aSopenharmony_ci src_u += pitch; 481cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4); 482cabdff1aSopenharmony_ci src_u += pitch; 483cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4); 484cabdff1aSopenharmony_ci src_u += pitch; 485cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4); 486cabdff1aSopenharmony_ci 487cabdff1aSopenharmony_ci src_v -= 3; 488cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4); 489cabdff1aSopenharmony_ci src_v += pitch; 490cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4); 491cabdff1aSopenharmony_ci src_v += pitch; 492cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4); 493cabdff1aSopenharmony_ci src_v += pitch; 494cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4); 495cabdff1aSopenharmony_ci src_v += pitch; 496cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4); 497cabdff1aSopenharmony_ci src_v += pitch; 498cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4); 499cabdff1aSopenharmony_ci src_v += pitch; 500cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4); 501cabdff1aSopenharmony_ci src_v += pitch; 502cabdff1aSopenharmony_ci VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4); 503cabdff1aSopenharmony_ci} 504cabdff1aSopenharmony_ci 505cabdff1aSopenharmony_civoid ff_vp8_v_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch, 506cabdff1aSopenharmony_ci int b_limit_ptr) 507cabdff1aSopenharmony_ci{ 508cabdff1aSopenharmony_ci v16u8 p1, p0, q1, q0; 509cabdff1aSopenharmony_ci v16u8 mask, b_limit; 510cabdff1aSopenharmony_ci 511cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 512cabdff1aSopenharmony_ci /* load vector elements */ 513cabdff1aSopenharmony_ci LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1); 514cabdff1aSopenharmony_ci VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask); 515cabdff1aSopenharmony_ci VP8_SIMPLE_FILT(p1, p0, q0, q1, mask); 516cabdff1aSopenharmony_ci ST_UB2(p0, q0, (src - pitch), pitch); 517cabdff1aSopenharmony_ci} 518cabdff1aSopenharmony_ci 519cabdff1aSopenharmony_civoid ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch, 520cabdff1aSopenharmony_ci int b_limit_ptr) 521cabdff1aSopenharmony_ci{ 522cabdff1aSopenharmony_ci uint8_t *temp_src; 523cabdff1aSopenharmony_ci v16u8 p1, p0, q1, q0; 524cabdff1aSopenharmony_ci v16u8 mask, b_limit; 525cabdff1aSopenharmony_ci v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; 526cabdff1aSopenharmony_ci v16u8 row9, row10, row11, row12, row13, row14, row15; 527cabdff1aSopenharmony_ci v8i16 tmp0, tmp1; 528cabdff1aSopenharmony_ci 529cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 530cabdff1aSopenharmony_ci temp_src = src - 2; 531cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7); 532cabdff1aSopenharmony_ci temp_src += (8 * pitch); 533cabdff1aSopenharmony_ci LD_UB8(temp_src, pitch, 534cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15); 535cabdff1aSopenharmony_ci TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, 536cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15, 537cabdff1aSopenharmony_ci p1, p0, q0, q1); 538cabdff1aSopenharmony_ci VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask); 539cabdff1aSopenharmony_ci VP8_SIMPLE_FILT(p1, p0, q0, q1, mask); 540cabdff1aSopenharmony_ci ILVRL_B2_SH(q0, p0, tmp1, tmp0); 541cabdff1aSopenharmony_ci 542cabdff1aSopenharmony_ci src -= 1; 543cabdff1aSopenharmony_ci ST_H8(tmp1, 0, 1, 2, 3, 4, 5, 6, 7, src, pitch) 544cabdff1aSopenharmony_ci ST_H8(tmp0, 0, 1, 2, 3, 4, 5, 6, 7, src + 8 * pitch, pitch) 545cabdff1aSopenharmony_ci} 546cabdff1aSopenharmony_ci 547cabdff1aSopenharmony_civoid ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v, 548cabdff1aSopenharmony_ci ptrdiff_t pitch, int b_limit_in, 549cabdff1aSopenharmony_ci int limit_in, int thresh_in) 550cabdff1aSopenharmony_ci{ 551cabdff1aSopenharmony_ci uint64_t p1_d, p0_d, q0_d, q1_d; 552cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 553cabdff1aSopenharmony_ci v16u8 mask, hev, flat, thresh, limit, b_limit; 554cabdff1aSopenharmony_ci v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; 555cabdff1aSopenharmony_ci v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; 556cabdff1aSopenharmony_ci 557cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_in); 558cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_in); 559cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_in); 560cabdff1aSopenharmony_ci 561cabdff1aSopenharmony_ci src_u = src_u - (pitch << 2); 562cabdff1aSopenharmony_ci LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u); 563cabdff1aSopenharmony_ci src_u += (5 * pitch); 564cabdff1aSopenharmony_ci src_v = src_v - (pitch << 2); 565cabdff1aSopenharmony_ci LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v); 566cabdff1aSopenharmony_ci src_v += (5 * pitch); 567cabdff1aSopenharmony_ci 568cabdff1aSopenharmony_ci /* right 8 element of p3 are u pixel and 569cabdff1aSopenharmony_ci left 8 element of p3 are v pixel */ 570cabdff1aSopenharmony_ci ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0); 571cabdff1aSopenharmony_ci ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3); 572cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 573cabdff1aSopenharmony_ci hev, mask, flat); 574cabdff1aSopenharmony_ci VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); 575cabdff1aSopenharmony_ci 576cabdff1aSopenharmony_ci p1_d = __msa_copy_u_d((v2i64) p1, 0); 577cabdff1aSopenharmony_ci p0_d = __msa_copy_u_d((v2i64) p0, 0); 578cabdff1aSopenharmony_ci q0_d = __msa_copy_u_d((v2i64) q0, 0); 579cabdff1aSopenharmony_ci q1_d = __msa_copy_u_d((v2i64) q1, 0); 580cabdff1aSopenharmony_ci SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch)); 581cabdff1aSopenharmony_ci 582cabdff1aSopenharmony_ci p1_d = __msa_copy_u_d((v2i64) p1, 1); 583cabdff1aSopenharmony_ci p0_d = __msa_copy_u_d((v2i64) p0, 1); 584cabdff1aSopenharmony_ci q0_d = __msa_copy_u_d((v2i64) q0, 1); 585cabdff1aSopenharmony_ci q1_d = __msa_copy_u_d((v2i64) q1, 1); 586cabdff1aSopenharmony_ci SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch)); 587cabdff1aSopenharmony_ci} 588cabdff1aSopenharmony_ci 589cabdff1aSopenharmony_civoid ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v, 590cabdff1aSopenharmony_ci ptrdiff_t pitch, int b_limit_in, 591cabdff1aSopenharmony_ci int limit_in, int thresh_in) 592cabdff1aSopenharmony_ci{ 593cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 594cabdff1aSopenharmony_ci v16u8 mask, hev, flat, thresh, limit, b_limit; 595cabdff1aSopenharmony_ci v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; 596cabdff1aSopenharmony_ci v16u8 row9, row10, row11, row12, row13, row14, row15; 597cabdff1aSopenharmony_ci v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 598cabdff1aSopenharmony_ci 599cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(thresh_in); 600cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(limit_in); 601cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(b_limit_in); 602cabdff1aSopenharmony_ci 603cabdff1aSopenharmony_ci LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); 604cabdff1aSopenharmony_ci LD_UB8(src_v - 4, pitch, 605cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15); 606cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, 607cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15, 608cabdff1aSopenharmony_ci p3, p2, p1, p0, q0, q1, q2, q3); 609cabdff1aSopenharmony_ci 610cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 611cabdff1aSopenharmony_ci hev, mask, flat); 612cabdff1aSopenharmony_ci VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); 613cabdff1aSopenharmony_ci ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1); 614cabdff1aSopenharmony_ci ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3); 615cabdff1aSopenharmony_ci tmp0 = (v4i32) __msa_ilvl_b((v16i8) p0, (v16i8) p1); 616cabdff1aSopenharmony_ci tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0); 617cabdff1aSopenharmony_ci ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5); 618cabdff1aSopenharmony_ci 619cabdff1aSopenharmony_ci ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src_u - 2, pitch); 620cabdff1aSopenharmony_ci ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src_v - 2, pitch); 621cabdff1aSopenharmony_ci} 622cabdff1aSopenharmony_ci 623cabdff1aSopenharmony_civoid ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch, 624cabdff1aSopenharmony_ci int32_t e, int32_t i, int32_t h) 625cabdff1aSopenharmony_ci{ 626cabdff1aSopenharmony_ci v16u8 mask, hev, flat; 627cabdff1aSopenharmony_ci v16u8 thresh, b_limit, limit; 628cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 629cabdff1aSopenharmony_ci 630cabdff1aSopenharmony_ci /* load vector elements */ 631cabdff1aSopenharmony_ci LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); 632cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(h); 633cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(e); 634cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(i); 635cabdff1aSopenharmony_ci 636cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 637cabdff1aSopenharmony_ci hev, mask, flat); 638cabdff1aSopenharmony_ci VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); 639cabdff1aSopenharmony_ci 640cabdff1aSopenharmony_ci ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); 641cabdff1aSopenharmony_ci} 642cabdff1aSopenharmony_ci 643cabdff1aSopenharmony_civoid ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch, 644cabdff1aSopenharmony_ci int32_t e, int32_t i, int32_t h) 645cabdff1aSopenharmony_ci{ 646cabdff1aSopenharmony_ci v16u8 mask, hev, flat; 647cabdff1aSopenharmony_ci v16u8 thresh, b_limit, limit; 648cabdff1aSopenharmony_ci v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 649cabdff1aSopenharmony_ci v16u8 row0, row1, row2, row3, row4, row5, row6, row7; 650cabdff1aSopenharmony_ci v16u8 row8, row9, row10, row11, row12, row13, row14, row15; 651cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 652cabdff1aSopenharmony_ci 653cabdff1aSopenharmony_ci LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); 654cabdff1aSopenharmony_ci LD_UB8(src - 4 + (8 * pitch), pitch, 655cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15); 656cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, 657cabdff1aSopenharmony_ci row8, row9, row10, row11, row12, row13, row14, row15, 658cabdff1aSopenharmony_ci p3, p2, p1, p0, q0, q1, q2, q3); 659cabdff1aSopenharmony_ci 660cabdff1aSopenharmony_ci thresh = (v16u8) __msa_fill_b(h); 661cabdff1aSopenharmony_ci b_limit = (v16u8) __msa_fill_b(e); 662cabdff1aSopenharmony_ci limit = (v16u8) __msa_fill_b(i); 663cabdff1aSopenharmony_ci 664cabdff1aSopenharmony_ci LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 665cabdff1aSopenharmony_ci hev, mask, flat); 666cabdff1aSopenharmony_ci VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); 667cabdff1aSopenharmony_ci ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); 668cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); 669cabdff1aSopenharmony_ci ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); 670cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); 671cabdff1aSopenharmony_ci 672cabdff1aSopenharmony_ci src -= 2; 673cabdff1aSopenharmony_ci ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch) 674cabdff1aSopenharmony_ci ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch) 675cabdff1aSopenharmony_ci} 676