1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn> 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "vp3dsp_mips.h" 22cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 23cabdff1aSopenharmony_ci#include "libavutil/intreadwrite.h" 24cabdff1aSopenharmony_ci#include "libavcodec/rnd_avg.h" 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_cistatic void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) 27cabdff1aSopenharmony_ci{ 28cabdff1aSopenharmony_ci v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign; 29cabdff1aSopenharmony_ci v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l, 30cabdff1aSopenharmony_ci r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l; 31cabdff1aSopenharmony_ci v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H; 32cabdff1aSopenharmony_ci v4i32 Ed, Gd, Add, Bdd, Fd, Hd; 33cabdff1aSopenharmony_ci v16u8 sign_l; 34cabdff1aSopenharmony_ci v16i8 d0, d1, d2, d3, d4, d5, d6, d7; 35cabdff1aSopenharmony_ci v4i32 c0, c1, c2, c3, c4, c5, c6, c7; 36cabdff1aSopenharmony_ci v4i32 f0, f1, f2, f3, f4, f5, f6, f7; 37cabdff1aSopenharmony_ci v4i32 sign_t; 38cabdff1aSopenharmony_ci v16i8 zero = {0}; 39cabdff1aSopenharmony_ci v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; 40cabdff1aSopenharmony_ci v4i32 cnst64277w = {64277, 64277, 64277, 64277}; 41cabdff1aSopenharmony_ci v4i32 cnst60547w = {60547, 60547, 60547, 60547}; 42cabdff1aSopenharmony_ci v4i32 cnst54491w = {54491, 54491, 54491, 54491}; 43cabdff1aSopenharmony_ci v4i32 cnst46341w = {46341, 46341, 46341, 46341}; 44cabdff1aSopenharmony_ci v4i32 cnst36410w = {36410, 36410, 36410, 36410}; 45cabdff1aSopenharmony_ci v4i32 cnst25080w = {25080, 25080, 25080, 25080}; 46cabdff1aSopenharmony_ci v4i32 cnst12785w = {12785, 12785, 12785, 12785}; 47cabdff1aSopenharmony_ci v4i32 cnst8w = {8, 8, 8, 8}; 48cabdff1aSopenharmony_ci v4i32 cnst2048w = {2048, 2048, 2048, 2048}; 49cabdff1aSopenharmony_ci v4i32 cnst128w = {128, 128, 128, 128}; 50cabdff1aSopenharmony_ci 51cabdff1aSopenharmony_ci /* Extended input data */ 52cabdff1aSopenharmony_ci LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7); 53cabdff1aSopenharmony_ci sign = __msa_clti_s_h(r0, 0); 54cabdff1aSopenharmony_ci r0_r = (v4i32) __msa_ilvr_h(sign, r0); 55cabdff1aSopenharmony_ci r0_l = (v4i32) __msa_ilvl_h(sign, r0); 56cabdff1aSopenharmony_ci sign = __msa_clti_s_h(r1, 0); 57cabdff1aSopenharmony_ci r1_r = (v4i32) __msa_ilvr_h(sign, r1); 58cabdff1aSopenharmony_ci r1_l = (v4i32) __msa_ilvl_h(sign, r1); 59cabdff1aSopenharmony_ci sign = __msa_clti_s_h(r2, 0); 60cabdff1aSopenharmony_ci r2_r = (v4i32) __msa_ilvr_h(sign, r2); 61cabdff1aSopenharmony_ci r2_l = (v4i32) __msa_ilvl_h(sign, r2); 62cabdff1aSopenharmony_ci sign = __msa_clti_s_h(r3, 0); 63cabdff1aSopenharmony_ci r3_r = (v4i32) __msa_ilvr_h(sign, r3); 64cabdff1aSopenharmony_ci r3_l = (v4i32) __msa_ilvl_h(sign, r3); 65cabdff1aSopenharmony_ci sign = __msa_clti_s_h(r4, 0); 66cabdff1aSopenharmony_ci r4_r = (v4i32) __msa_ilvr_h(sign, r4); 67cabdff1aSopenharmony_ci r4_l = (v4i32) __msa_ilvl_h(sign, r4); 68cabdff1aSopenharmony_ci sign = __msa_clti_s_h(r5, 0); 69cabdff1aSopenharmony_ci r5_r = (v4i32) __msa_ilvr_h(sign, r5); 70cabdff1aSopenharmony_ci r5_l = (v4i32) __msa_ilvl_h(sign, r5); 71cabdff1aSopenharmony_ci sign = __msa_clti_s_h(r6, 0); 72cabdff1aSopenharmony_ci r6_r = (v4i32) __msa_ilvr_h(sign, r6); 73cabdff1aSopenharmony_ci r6_l = (v4i32) __msa_ilvl_h(sign, r6); 74cabdff1aSopenharmony_ci sign = __msa_clti_s_h(r7, 0); 75cabdff1aSopenharmony_ci r7_r = (v4i32) __msa_ilvr_h(sign, r7); 76cabdff1aSopenharmony_ci r7_l = (v4i32) __msa_ilvl_h(sign, r7); 77cabdff1aSopenharmony_ci 78cabdff1aSopenharmony_ci /* Right part */ 79cabdff1aSopenharmony_ci A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16); 80cabdff1aSopenharmony_ci B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16); 81cabdff1aSopenharmony_ci C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16); 82cabdff1aSopenharmony_ci D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16); 83cabdff1aSopenharmony_ci Ad = ((A - C) * cnst46341w) >> 16; 84cabdff1aSopenharmony_ci Bd = ((B - D) * cnst46341w) >> 16; 85cabdff1aSopenharmony_ci Cd = A + C; 86cabdff1aSopenharmony_ci Dd = B + D; 87cabdff1aSopenharmony_ci E = ((r0_r + r4_r) * cnst46341w) >> 16; 88cabdff1aSopenharmony_ci F = ((r0_r - r4_r) * cnst46341w) >> 16; 89cabdff1aSopenharmony_ci G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16); 90cabdff1aSopenharmony_ci H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16); 91cabdff1aSopenharmony_ci Ed = E - G; 92cabdff1aSopenharmony_ci Gd = E + G; 93cabdff1aSopenharmony_ci Add = F + Ad; 94cabdff1aSopenharmony_ci Bdd = Bd - H; 95cabdff1aSopenharmony_ci Fd = F - Ad; 96cabdff1aSopenharmony_ci Hd = Bd + H; 97cabdff1aSopenharmony_ci r0_r = Gd + Cd; 98cabdff1aSopenharmony_ci r7_r = Gd - Cd; 99cabdff1aSopenharmony_ci r1_r = Add + Hd; 100cabdff1aSopenharmony_ci r2_r = Add - Hd; 101cabdff1aSopenharmony_ci r3_r = Ed + Dd; 102cabdff1aSopenharmony_ci r4_r = Ed - Dd; 103cabdff1aSopenharmony_ci r5_r = Fd + Bdd; 104cabdff1aSopenharmony_ci r6_r = Fd - Bdd; 105cabdff1aSopenharmony_ci 106cabdff1aSopenharmony_ci /* Left part */ 107cabdff1aSopenharmony_ci A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16); 108cabdff1aSopenharmony_ci B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16); 109cabdff1aSopenharmony_ci C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16); 110cabdff1aSopenharmony_ci D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16); 111cabdff1aSopenharmony_ci Ad = ((A - C) * cnst46341w) >> 16; 112cabdff1aSopenharmony_ci Bd = ((B - D) * cnst46341w) >> 16; 113cabdff1aSopenharmony_ci Cd = A + C; 114cabdff1aSopenharmony_ci Dd = B + D; 115cabdff1aSopenharmony_ci E = ((r0_l + r4_l) * cnst46341w) >> 16; 116cabdff1aSopenharmony_ci F = ((r0_l - r4_l) * cnst46341w) >> 16; 117cabdff1aSopenharmony_ci G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16); 118cabdff1aSopenharmony_ci H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16); 119cabdff1aSopenharmony_ci Ed = E - G; 120cabdff1aSopenharmony_ci Gd = E + G; 121cabdff1aSopenharmony_ci Add = F + Ad; 122cabdff1aSopenharmony_ci Bdd = Bd - H; 123cabdff1aSopenharmony_ci Fd = F - Ad; 124cabdff1aSopenharmony_ci Hd = Bd + H; 125cabdff1aSopenharmony_ci r0_l = Gd + Cd; 126cabdff1aSopenharmony_ci r7_l = Gd - Cd; 127cabdff1aSopenharmony_ci r1_l = Add + Hd; 128cabdff1aSopenharmony_ci r2_l = Add - Hd; 129cabdff1aSopenharmony_ci r3_l = Ed + Dd; 130cabdff1aSopenharmony_ci r4_l = Ed - Dd; 131cabdff1aSopenharmony_ci r5_l = Fd + Bdd; 132cabdff1aSopenharmony_ci r6_l = Fd - Bdd; 133cabdff1aSopenharmony_ci 134cabdff1aSopenharmony_ci /* Row 0 to 3 */ 135cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(r0_r, r1_r, r2_r, r3_r, 136cabdff1aSopenharmony_ci r0_r, r1_r, r2_r, r3_r); 137cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(r0_l, r1_l, r2_l, r3_l, 138cabdff1aSopenharmony_ci r0_l, r1_l, r2_l, r3_l); 139cabdff1aSopenharmony_ci A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16); 140cabdff1aSopenharmony_ci B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16); 141cabdff1aSopenharmony_ci C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16); 142cabdff1aSopenharmony_ci D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16); 143cabdff1aSopenharmony_ci Ad = ((A - C) * cnst46341w) >> 16; 144cabdff1aSopenharmony_ci Bd = ((B - D) * cnst46341w) >> 16; 145cabdff1aSopenharmony_ci Cd = A + C; 146cabdff1aSopenharmony_ci Dd = B + D; 147cabdff1aSopenharmony_ci E = ((r0_r + r0_l) * cnst46341w) >> 16; 148cabdff1aSopenharmony_ci E += cnst8w; 149cabdff1aSopenharmony_ci F = ((r0_r - r0_l) * cnst46341w) >> 16; 150cabdff1aSopenharmony_ci F += cnst8w; 151cabdff1aSopenharmony_ci if (type == 1) { // HACK 152cabdff1aSopenharmony_ci E += cnst2048w; 153cabdff1aSopenharmony_ci F += cnst2048w; 154cabdff1aSopenharmony_ci } 155cabdff1aSopenharmony_ci G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16); 156cabdff1aSopenharmony_ci H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16); 157cabdff1aSopenharmony_ci Ed = E - G; 158cabdff1aSopenharmony_ci Gd = E + G; 159cabdff1aSopenharmony_ci Add = F + Ad; 160cabdff1aSopenharmony_ci Bdd = Bd - H; 161cabdff1aSopenharmony_ci Fd = F - Ad; 162cabdff1aSopenharmony_ci Hd = Bd + H; 163cabdff1aSopenharmony_ci A = (Gd + Cd) >> 4; 164cabdff1aSopenharmony_ci B = (Gd - Cd) >> 4; 165cabdff1aSopenharmony_ci C = (Add + Hd) >> 4; 166cabdff1aSopenharmony_ci D = (Add - Hd) >> 4; 167cabdff1aSopenharmony_ci E = (Ed + Dd) >> 4; 168cabdff1aSopenharmony_ci F = (Ed - Dd) >> 4; 169cabdff1aSopenharmony_ci G = (Fd + Bdd) >> 4; 170cabdff1aSopenharmony_ci H = (Fd - Bdd) >> 4; 171cabdff1aSopenharmony_ci if (type != 1) { 172cabdff1aSopenharmony_ci LD_SB8(dst, stride, d0, d1, d2, d3, d4, d5, d6, d7); 173cabdff1aSopenharmony_ci ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3, 174cabdff1aSopenharmony_ci f0, f1, f2, f3); 175cabdff1aSopenharmony_ci ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7, 176cabdff1aSopenharmony_ci f4, f5, f6, f7); 177cabdff1aSopenharmony_ci ILVR_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3, 178cabdff1aSopenharmony_ci c0, c1, c2, c3); 179cabdff1aSopenharmony_ci ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7, 180cabdff1aSopenharmony_ci c4, c5, c6, c7); 181cabdff1aSopenharmony_ci A += c0; 182cabdff1aSopenharmony_ci B += c7; 183cabdff1aSopenharmony_ci C += c1; 184cabdff1aSopenharmony_ci D += c2; 185cabdff1aSopenharmony_ci E += c3; 186cabdff1aSopenharmony_ci F += c4; 187cabdff1aSopenharmony_ci G += c5; 188cabdff1aSopenharmony_ci H += c6; 189cabdff1aSopenharmony_ci } 190cabdff1aSopenharmony_ci CLIP_SW8_0_255(A, B, C, D, E, F, G, H); 191cabdff1aSopenharmony_ci sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r); 192cabdff1aSopenharmony_ci sign_l = __msa_or_v(sign_l, (v16u8)r3_r); 193cabdff1aSopenharmony_ci sign_l = __msa_or_v(sign_l, (v16u8)r0_l); 194cabdff1aSopenharmony_ci sign_l = __msa_or_v(sign_l, (v16u8)r1_l); 195cabdff1aSopenharmony_ci sign_l = __msa_or_v(sign_l, (v16u8)r2_l); 196cabdff1aSopenharmony_ci sign_l = __msa_or_v(sign_l, (v16u8)r3_l); 197cabdff1aSopenharmony_ci sign_t = __msa_ceqi_w((v4i32)sign_l, 0); 198cabdff1aSopenharmony_ci Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20; 199cabdff1aSopenharmony_ci if (type == 1) { 200cabdff1aSopenharmony_ci Bdd = Add + cnst128w; 201cabdff1aSopenharmony_ci CLIP_SW_0_255(Bdd); 202cabdff1aSopenharmony_ci Ad = Bdd; 203cabdff1aSopenharmony_ci Bd = Bdd; 204cabdff1aSopenharmony_ci Cd = Bdd; 205cabdff1aSopenharmony_ci Dd = Bdd; 206cabdff1aSopenharmony_ci Ed = Bdd; 207cabdff1aSopenharmony_ci Fd = Bdd; 208cabdff1aSopenharmony_ci Gd = Bdd; 209cabdff1aSopenharmony_ci Hd = Bdd; 210cabdff1aSopenharmony_ci } else { 211cabdff1aSopenharmony_ci Ad = Add + c0; 212cabdff1aSopenharmony_ci Bd = Add + c1; 213cabdff1aSopenharmony_ci Cd = Add + c2; 214cabdff1aSopenharmony_ci Dd = Add + c3; 215cabdff1aSopenharmony_ci Ed = Add + c4; 216cabdff1aSopenharmony_ci Fd = Add + c5; 217cabdff1aSopenharmony_ci Gd = Add + c6; 218cabdff1aSopenharmony_ci Hd = Add + c7; 219cabdff1aSopenharmony_ci CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd); 220cabdff1aSopenharmony_ci } 221cabdff1aSopenharmony_ci Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t); 222cabdff1aSopenharmony_ci Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t); 223cabdff1aSopenharmony_ci Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t); 224cabdff1aSopenharmony_ci Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t); 225cabdff1aSopenharmony_ci Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t); 226cabdff1aSopenharmony_ci Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t); 227cabdff1aSopenharmony_ci Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t); 228cabdff1aSopenharmony_ci Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t); 229cabdff1aSopenharmony_ci sign_t = __msa_ceqi_w(sign_t, 0); 230cabdff1aSopenharmony_ci A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t); 231cabdff1aSopenharmony_ci B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t); 232cabdff1aSopenharmony_ci C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t); 233cabdff1aSopenharmony_ci D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t); 234cabdff1aSopenharmony_ci E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t); 235cabdff1aSopenharmony_ci F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t); 236cabdff1aSopenharmony_ci G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t); 237cabdff1aSopenharmony_ci H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t); 238cabdff1aSopenharmony_ci r0_r = Ad + A; 239cabdff1aSopenharmony_ci r1_r = Bd + C; 240cabdff1aSopenharmony_ci r2_r = Cd + D; 241cabdff1aSopenharmony_ci r3_r = Dd + E; 242cabdff1aSopenharmony_ci r0_l = Ed + F; 243cabdff1aSopenharmony_ci r1_l = Fd + G; 244cabdff1aSopenharmony_ci r2_l = Gd + H; 245cabdff1aSopenharmony_ci r3_l = Hd + B; 246cabdff1aSopenharmony_ci 247cabdff1aSopenharmony_ci /* Row 4 to 7 */ 248cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r, 249cabdff1aSopenharmony_ci r4_r, r5_r, r6_r, r7_r); 250cabdff1aSopenharmony_ci TRANSPOSE4x4_SW_SW(r4_l, r5_l, r6_l, r7_l, 251cabdff1aSopenharmony_ci r4_l, r5_l, r6_l, r7_l); 252cabdff1aSopenharmony_ci A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16); 253cabdff1aSopenharmony_ci B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16); 254cabdff1aSopenharmony_ci C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16); 255cabdff1aSopenharmony_ci D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16); 256cabdff1aSopenharmony_ci Ad = ((A - C) * cnst46341w) >> 16; 257cabdff1aSopenharmony_ci Bd = ((B - D) * cnst46341w) >> 16; 258cabdff1aSopenharmony_ci Cd = A + C; 259cabdff1aSopenharmony_ci Dd = B + D; 260cabdff1aSopenharmony_ci E = ((r4_r + r4_l) * cnst46341w) >> 16; 261cabdff1aSopenharmony_ci E += cnst8w; 262cabdff1aSopenharmony_ci F = ((r4_r - r4_l) * cnst46341w) >> 16; 263cabdff1aSopenharmony_ci F += cnst8w; 264cabdff1aSopenharmony_ci if (type == 1) { // HACK 265cabdff1aSopenharmony_ci E += cnst2048w; 266cabdff1aSopenharmony_ci F += cnst2048w; 267cabdff1aSopenharmony_ci } 268cabdff1aSopenharmony_ci G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16); 269cabdff1aSopenharmony_ci H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16); 270cabdff1aSopenharmony_ci Ed = E - G; 271cabdff1aSopenharmony_ci Gd = E + G; 272cabdff1aSopenharmony_ci Add = F + Ad; 273cabdff1aSopenharmony_ci Bdd = Bd - H; 274cabdff1aSopenharmony_ci Fd = F - Ad; 275cabdff1aSopenharmony_ci Hd = Bd + H; 276cabdff1aSopenharmony_ci A = (Gd + Cd) >> 4; 277cabdff1aSopenharmony_ci B = (Gd - Cd) >> 4; 278cabdff1aSopenharmony_ci C = (Add + Hd) >> 4; 279cabdff1aSopenharmony_ci D = (Add - Hd) >> 4; 280cabdff1aSopenharmony_ci E = (Ed + Dd) >> 4; 281cabdff1aSopenharmony_ci F = (Ed - Dd) >> 4; 282cabdff1aSopenharmony_ci G = (Fd + Bdd) >> 4; 283cabdff1aSopenharmony_ci H = (Fd - Bdd) >> 4; 284cabdff1aSopenharmony_ci if (type != 1) { 285cabdff1aSopenharmony_ci ILVL_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3, 286cabdff1aSopenharmony_ci c0, c1, c2, c3); 287cabdff1aSopenharmony_ci ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7, 288cabdff1aSopenharmony_ci c4, c5, c6, c7); 289cabdff1aSopenharmony_ci A += c0; 290cabdff1aSopenharmony_ci B += c7; 291cabdff1aSopenharmony_ci C += c1; 292cabdff1aSopenharmony_ci D += c2; 293cabdff1aSopenharmony_ci E += c3; 294cabdff1aSopenharmony_ci F += c4; 295cabdff1aSopenharmony_ci G += c5; 296cabdff1aSopenharmony_ci H += c6; 297cabdff1aSopenharmony_ci } 298cabdff1aSopenharmony_ci CLIP_SW8_0_255(A, B, C, D, E, F, G, H); 299cabdff1aSopenharmony_ci sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r); 300cabdff1aSopenharmony_ci sign_l = __msa_or_v(sign_l, (v16u8)r7_r); 301cabdff1aSopenharmony_ci sign_l = __msa_or_v(sign_l, (v16u8)r4_l); 302cabdff1aSopenharmony_ci sign_l = __msa_or_v(sign_l, (v16u8)r5_l); 303cabdff1aSopenharmony_ci sign_l = __msa_or_v(sign_l, (v16u8)r6_l); 304cabdff1aSopenharmony_ci sign_l = __msa_or_v(sign_l, (v16u8)r7_l); 305cabdff1aSopenharmony_ci sign_t = __msa_ceqi_w((v4i32)sign_l, 0); 306cabdff1aSopenharmony_ci Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20; 307cabdff1aSopenharmony_ci if (type == 1) { 308cabdff1aSopenharmony_ci Bdd = Add + cnst128w; 309cabdff1aSopenharmony_ci CLIP_SW_0_255(Bdd); 310cabdff1aSopenharmony_ci Ad = Bdd; 311cabdff1aSopenharmony_ci Bd = Bdd; 312cabdff1aSopenharmony_ci Cd = Bdd; 313cabdff1aSopenharmony_ci Dd = Bdd; 314cabdff1aSopenharmony_ci Ed = Bdd; 315cabdff1aSopenharmony_ci Fd = Bdd; 316cabdff1aSopenharmony_ci Gd = Bdd; 317cabdff1aSopenharmony_ci Hd = Bdd; 318cabdff1aSopenharmony_ci } else { 319cabdff1aSopenharmony_ci Ad = Add + c0; 320cabdff1aSopenharmony_ci Bd = Add + c1; 321cabdff1aSopenharmony_ci Cd = Add + c2; 322cabdff1aSopenharmony_ci Dd = Add + c3; 323cabdff1aSopenharmony_ci Ed = Add + c4; 324cabdff1aSopenharmony_ci Fd = Add + c5; 325cabdff1aSopenharmony_ci Gd = Add + c6; 326cabdff1aSopenharmony_ci Hd = Add + c7; 327cabdff1aSopenharmony_ci CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd); 328cabdff1aSopenharmony_ci } 329cabdff1aSopenharmony_ci Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t); 330cabdff1aSopenharmony_ci Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t); 331cabdff1aSopenharmony_ci Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t); 332cabdff1aSopenharmony_ci Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t); 333cabdff1aSopenharmony_ci Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t); 334cabdff1aSopenharmony_ci Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t); 335cabdff1aSopenharmony_ci Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t); 336cabdff1aSopenharmony_ci Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t); 337cabdff1aSopenharmony_ci sign_t = __msa_ceqi_w(sign_t, 0); 338cabdff1aSopenharmony_ci A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t); 339cabdff1aSopenharmony_ci B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t); 340cabdff1aSopenharmony_ci C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t); 341cabdff1aSopenharmony_ci D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t); 342cabdff1aSopenharmony_ci E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t); 343cabdff1aSopenharmony_ci F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t); 344cabdff1aSopenharmony_ci G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t); 345cabdff1aSopenharmony_ci H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t); 346cabdff1aSopenharmony_ci r4_r = Ad + A; 347cabdff1aSopenharmony_ci r5_r = Bd + C; 348cabdff1aSopenharmony_ci r6_r = Cd + D; 349cabdff1aSopenharmony_ci r7_r = Dd + E; 350cabdff1aSopenharmony_ci r4_l = Ed + F; 351cabdff1aSopenharmony_ci r5_l = Fd + G; 352cabdff1aSopenharmony_ci r6_l = Gd + H; 353cabdff1aSopenharmony_ci r7_l = Hd + B; 354cabdff1aSopenharmony_ci VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1); 355cabdff1aSopenharmony_ci VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3); 356cabdff1aSopenharmony_ci VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5); 357cabdff1aSopenharmony_ci VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7); 358cabdff1aSopenharmony_ci 359cabdff1aSopenharmony_ci /* Final sequence of operations over-write original dst */ 360cabdff1aSopenharmony_ci ST_D1(d0, 0, dst); 361cabdff1aSopenharmony_ci ST_D1(d1, 0, dst + stride); 362cabdff1aSopenharmony_ci ST_D1(d2, 0, dst + 2 * stride); 363cabdff1aSopenharmony_ci ST_D1(d3, 0, dst + 3 * stride); 364cabdff1aSopenharmony_ci ST_D1(d4, 0, dst + 4 * stride); 365cabdff1aSopenharmony_ci ST_D1(d5, 0, dst + 5 * stride); 366cabdff1aSopenharmony_ci ST_D1(d6, 0, dst + 6 * stride); 367cabdff1aSopenharmony_ci ST_D1(d7, 0, dst + 7 * stride); 368cabdff1aSopenharmony_ci} 369cabdff1aSopenharmony_ci 370cabdff1aSopenharmony_civoid ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block) 371cabdff1aSopenharmony_ci{ 372cabdff1aSopenharmony_ci idct_msa(dest, line_size, block, 1); 373cabdff1aSopenharmony_ci memset(block, 0, sizeof(*block) * 64); 374cabdff1aSopenharmony_ci} 375cabdff1aSopenharmony_ci 376cabdff1aSopenharmony_civoid ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block) 377cabdff1aSopenharmony_ci{ 378cabdff1aSopenharmony_ci idct_msa(dest, line_size, block, 2); 379cabdff1aSopenharmony_ci memset(block, 0, sizeof(*block) * 64); 380cabdff1aSopenharmony_ci} 381cabdff1aSopenharmony_ci 382cabdff1aSopenharmony_civoid ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block) 383cabdff1aSopenharmony_ci{ 384cabdff1aSopenharmony_ci int i = (block[0] + 15) >> 5; 385cabdff1aSopenharmony_ci v4i32 dc = {i, i, i, i}; 386cabdff1aSopenharmony_ci v16i8 d0, d1, d2, d3, d4, d5, d6, d7; 387cabdff1aSopenharmony_ci v4i32 c0, c1, c2, c3, c4, c5, c6, c7; 388cabdff1aSopenharmony_ci v4i32 e0, e1, e2, e3, e4, e5, e6, e7; 389cabdff1aSopenharmony_ci v4i32 r0, r1, r2, r3, r4, r5, r6, r7; 390cabdff1aSopenharmony_ci v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; 391cabdff1aSopenharmony_ci v16i8 zero = {0}; 392cabdff1aSopenharmony_ci 393cabdff1aSopenharmony_ci LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7); 394cabdff1aSopenharmony_ci ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3, 395cabdff1aSopenharmony_ci c0, c1, c2, c3); 396cabdff1aSopenharmony_ci ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7, 397cabdff1aSopenharmony_ci c4, c5, c6, c7); 398cabdff1aSopenharmony_ci /* Right part */ 399cabdff1aSopenharmony_ci ILVR_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3, 400cabdff1aSopenharmony_ci e0, e1, e2, e3); 401cabdff1aSopenharmony_ci ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7, 402cabdff1aSopenharmony_ci e4, e5, e6, e7); 403cabdff1aSopenharmony_ci e0 += dc; 404cabdff1aSopenharmony_ci e1 += dc; 405cabdff1aSopenharmony_ci e2 += dc; 406cabdff1aSopenharmony_ci e3 += dc; 407cabdff1aSopenharmony_ci e4 += dc; 408cabdff1aSopenharmony_ci e5 += dc; 409cabdff1aSopenharmony_ci e6 += dc; 410cabdff1aSopenharmony_ci e7 += dc; 411cabdff1aSopenharmony_ci CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7); 412cabdff1aSopenharmony_ci 413cabdff1aSopenharmony_ci /* Left part */ 414cabdff1aSopenharmony_ci ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3, 415cabdff1aSopenharmony_ci r0, r1, r2, r3); 416cabdff1aSopenharmony_ci ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7, 417cabdff1aSopenharmony_ci r4, r5, r6, r7); 418cabdff1aSopenharmony_ci r0 += dc; 419cabdff1aSopenharmony_ci r1 += dc; 420cabdff1aSopenharmony_ci r2 += dc; 421cabdff1aSopenharmony_ci r3 += dc; 422cabdff1aSopenharmony_ci r4 += dc; 423cabdff1aSopenharmony_ci r5 += dc; 424cabdff1aSopenharmony_ci r6 += dc; 425cabdff1aSopenharmony_ci r7 += dc; 426cabdff1aSopenharmony_ci CLIP_SW8_0_255(r0, r1, r2, r3, r4, r5, r6, r7); 427cabdff1aSopenharmony_ci VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1); 428cabdff1aSopenharmony_ci VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3); 429cabdff1aSopenharmony_ci VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5); 430cabdff1aSopenharmony_ci VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7); 431cabdff1aSopenharmony_ci 432cabdff1aSopenharmony_ci /* Final sequence of operations over-write original dst */ 433cabdff1aSopenharmony_ci ST_D1(d0, 0, dest); 434cabdff1aSopenharmony_ci ST_D1(d1, 0, dest + line_size); 435cabdff1aSopenharmony_ci ST_D1(d2, 0, dest + 2 * line_size); 436cabdff1aSopenharmony_ci ST_D1(d3, 0, dest + 3 * line_size); 437cabdff1aSopenharmony_ci ST_D1(d4, 0, dest + 4 * line_size); 438cabdff1aSopenharmony_ci ST_D1(d5, 0, dest + 5 * line_size); 439cabdff1aSopenharmony_ci ST_D1(d6, 0, dest + 6 * line_size); 440cabdff1aSopenharmony_ci ST_D1(d7, 0, dest + 7 * line_size); 441cabdff1aSopenharmony_ci 442cabdff1aSopenharmony_ci block[0] = 0; 443cabdff1aSopenharmony_ci} 444cabdff1aSopenharmony_ci 445cabdff1aSopenharmony_civoid ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride, 446cabdff1aSopenharmony_ci int *bounding_values) 447cabdff1aSopenharmony_ci{ 448cabdff1aSopenharmony_ci int nstride = -stride; 449cabdff1aSopenharmony_ci v4i32 e0, e1, f0, f1, g0, g1; 450cabdff1aSopenharmony_ci v16i8 zero = {0}; 451cabdff1aSopenharmony_ci v16i8 d0, d1, d2, d3; 452cabdff1aSopenharmony_ci v8i16 c0, c1, c2, c3; 453cabdff1aSopenharmony_ci v8i16 r0; 454cabdff1aSopenharmony_ci v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3}, 455cabdff1aSopenharmony_ci cnst4h = {4, 4, 4, 4, 4, 4, 4, 4}; 456cabdff1aSopenharmony_ci v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; 457cabdff1aSopenharmony_ci int16_t temp_16[8]; 458cabdff1aSopenharmony_ci int temp_32[8]; 459cabdff1aSopenharmony_ci 460cabdff1aSopenharmony_ci LD_SB4(first_pixel + nstride * 2, stride, d0, d1, d2, d3); 461cabdff1aSopenharmony_ci ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3, 462cabdff1aSopenharmony_ci c0, c1, c2, c3); 463cabdff1aSopenharmony_ci r0 = (c0 - c3) + (c2 - c1) * cnst3h; 464cabdff1aSopenharmony_ci r0 += cnst4h; 465cabdff1aSopenharmony_ci r0 = r0 >> 3; 466cabdff1aSopenharmony_ci /* Get filter_value from bounding_values one by one */ 467cabdff1aSopenharmony_ci ST_SH(r0, temp_16); 468cabdff1aSopenharmony_ci for (int i = 0; i < 8; i++) 469cabdff1aSopenharmony_ci temp_32[i] = bounding_values[temp_16[i]]; 470cabdff1aSopenharmony_ci LD_SW2(temp_32, 4, e0, e1); 471cabdff1aSopenharmony_ci ILVR_H2_SW(zero, c1, zero, c2, f0, g0); 472cabdff1aSopenharmony_ci ILVL_H2_SW(zero, c1, zero, c2, f1, g1); 473cabdff1aSopenharmony_ci f0 += e0; 474cabdff1aSopenharmony_ci f1 += e1; 475cabdff1aSopenharmony_ci g0 -= e0; 476cabdff1aSopenharmony_ci g1 -= e1; 477cabdff1aSopenharmony_ci CLIP_SW4_0_255(f0, f1, g0, g1); 478cabdff1aSopenharmony_ci VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2); 479cabdff1aSopenharmony_ci 480cabdff1aSopenharmony_ci /* Final move to first_pixel */ 481cabdff1aSopenharmony_ci ST_D1(d1, 0, first_pixel + nstride); 482cabdff1aSopenharmony_ci ST_D1(d2, 0, first_pixel); 483cabdff1aSopenharmony_ci} 484cabdff1aSopenharmony_ci 485cabdff1aSopenharmony_civoid ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride, 486cabdff1aSopenharmony_ci int *bounding_values) 487cabdff1aSopenharmony_ci{ 488cabdff1aSopenharmony_ci v16i8 d0, d1, d2, d3, d4, d5, d6, d7; 489cabdff1aSopenharmony_ci v8i16 c0, c1, c2, c3, c4, c5, c6, c7; 490cabdff1aSopenharmony_ci v8i16 r0; 491cabdff1aSopenharmony_ci v4i32 e0, e1, f0, f1, g0, g1; 492cabdff1aSopenharmony_ci v16i8 zero = {0}; 493cabdff1aSopenharmony_ci v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3}, 494cabdff1aSopenharmony_ci cnst4h = {4, 4, 4, 4, 4, 4, 4, 4}; 495cabdff1aSopenharmony_ci v16i8 mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0}; 496cabdff1aSopenharmony_ci int16_t temp_16[8]; 497cabdff1aSopenharmony_ci int temp_32[8]; 498cabdff1aSopenharmony_ci 499cabdff1aSopenharmony_ci LD_SB8(first_pixel - 2, stride, d0, d1, d2, d3, d4, d5, d6, d7); 500cabdff1aSopenharmony_ci ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3, 501cabdff1aSopenharmony_ci c0, c1, c2, c3); 502cabdff1aSopenharmony_ci ILVR_B4_SH(zero, d4, zero, d5, zero, d6, zero, d7, 503cabdff1aSopenharmony_ci c4, c5, c6, c7); 504cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(c0, c1, c2, c3, c4, c5, c6, c7, 505cabdff1aSopenharmony_ci c0, c1, c2, c3, c4, c5, c6, c7); 506cabdff1aSopenharmony_ci r0 = (c0 - c3) + (c2 - c1) * cnst3h; 507cabdff1aSopenharmony_ci r0 += cnst4h; 508cabdff1aSopenharmony_ci r0 = r0 >> 3; 509cabdff1aSopenharmony_ci 510cabdff1aSopenharmony_ci /* Get filter_value from bounding_values one by one */ 511cabdff1aSopenharmony_ci ST_SH(r0, temp_16); 512cabdff1aSopenharmony_ci for (int i = 0; i < 8; i++) 513cabdff1aSopenharmony_ci temp_32[i] = bounding_values[temp_16[i]]; 514cabdff1aSopenharmony_ci LD_SW2(temp_32, 4, e0, e1); 515cabdff1aSopenharmony_ci ILVR_H2_SW(zero, c1, zero, c2, f0, g0); 516cabdff1aSopenharmony_ci ILVL_H2_SW(zero, c1, zero, c2, f1, g1); 517cabdff1aSopenharmony_ci f0 += e0; 518cabdff1aSopenharmony_ci f1 += e1; 519cabdff1aSopenharmony_ci g0 -= e0; 520cabdff1aSopenharmony_ci g1 -= e1; 521cabdff1aSopenharmony_ci CLIP_SW4_0_255(f0, f1, g0, g1); 522cabdff1aSopenharmony_ci VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2); 523cabdff1aSopenharmony_ci /* Final move to first_pixel */ 524cabdff1aSopenharmony_ci ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride); 525cabdff1aSopenharmony_ci ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride); 526cabdff1aSopenharmony_ci} 527cabdff1aSopenharmony_ci 528cabdff1aSopenharmony_civoid ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1, 529cabdff1aSopenharmony_ci const uint8_t *src2, ptrdiff_t stride, int h) 530cabdff1aSopenharmony_ci{ 531cabdff1aSopenharmony_ci if (h == 8) { 532cabdff1aSopenharmony_ci v16i8 d0, d1, d2, d3, d4, d5, d6, d7; 533cabdff1aSopenharmony_ci v16i8 c0, c1, c2, c3; 534cabdff1aSopenharmony_ci v4i32 a0, a1, a2, a3, b0, b1, b2, b3; 535cabdff1aSopenharmony_ci v4i32 e0, e1, e2; 536cabdff1aSopenharmony_ci v4i32 f0, f1, f2; 537cabdff1aSopenharmony_ci v4u32 t0, t1, t2, t3; 538cabdff1aSopenharmony_ci v16i8 mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}; 539cabdff1aSopenharmony_ci int32_t value = 0xfefefefe; 540cabdff1aSopenharmony_ci v4i32 fmask = {value, value, value, value}; 541cabdff1aSopenharmony_ci 542cabdff1aSopenharmony_ci LD_SB8(src1, stride, d0, d1, d2, d3, d4, d5, d6, d7); 543cabdff1aSopenharmony_ci VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1); 544cabdff1aSopenharmony_ci VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3); 545cabdff1aSopenharmony_ci a0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0); 546cabdff1aSopenharmony_ci a2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0); 547cabdff1aSopenharmony_ci a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2); 548cabdff1aSopenharmony_ci a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2); 549cabdff1aSopenharmony_ci 550cabdff1aSopenharmony_ci LD_SB8(src2, stride, d0, d1, d2, d3, d4, d5, d6, d7); 551cabdff1aSopenharmony_ci VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1); 552cabdff1aSopenharmony_ci VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3); 553cabdff1aSopenharmony_ci b0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0); 554cabdff1aSopenharmony_ci b2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0); 555cabdff1aSopenharmony_ci b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2); 556cabdff1aSopenharmony_ci b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2); 557cabdff1aSopenharmony_ci 558cabdff1aSopenharmony_ci e0 = (v4i32) __msa_xor_v((v16u8)a0, (v16u8)b0); 559cabdff1aSopenharmony_ci e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask); 560cabdff1aSopenharmony_ci t0 = ((v4u32)e0) >> 1; 561cabdff1aSopenharmony_ci e2 = (v4i32) __msa_and_v((v16u8)a0, (v16u8)b0); 562cabdff1aSopenharmony_ci t0 = t0 + (v4u32)e2; 563cabdff1aSopenharmony_ci 564cabdff1aSopenharmony_ci e1 = (v4i32) __msa_xor_v((v16u8)a1, (v16u8)b1); 565cabdff1aSopenharmony_ci e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask); 566cabdff1aSopenharmony_ci t1 = ((v4u32)e1) >> 1; 567cabdff1aSopenharmony_ci e2 = (v4i32) __msa_and_v((v16u8)a1, (v16u8)b1); 568cabdff1aSopenharmony_ci t1 = t1 + (v4u32)e2; 569cabdff1aSopenharmony_ci 570cabdff1aSopenharmony_ci f0 = (v4i32) __msa_xor_v((v16u8)a2, (v16u8)b2); 571cabdff1aSopenharmony_ci f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask); 572cabdff1aSopenharmony_ci t2 = ((v4u32)f0) >> 1; 573cabdff1aSopenharmony_ci f2 = (v4i32) __msa_and_v((v16u8)a2, (v16u8)b2); 574cabdff1aSopenharmony_ci t2 = t2 + (v4u32)f2; 575cabdff1aSopenharmony_ci 576cabdff1aSopenharmony_ci f1 = (v4i32) __msa_xor_v((v16u8)a3, (v16u8)b3); 577cabdff1aSopenharmony_ci f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask); 578cabdff1aSopenharmony_ci t3 = ((v4u32)f1) >> 1; 579cabdff1aSopenharmony_ci f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3); 580cabdff1aSopenharmony_ci t3 = t3 + (v4u32)f2; 581cabdff1aSopenharmony_ci 582cabdff1aSopenharmony_ci ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride); 583cabdff1aSopenharmony_ci ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride); 584cabdff1aSopenharmony_ci } else { 585cabdff1aSopenharmony_ci int i; 586cabdff1aSopenharmony_ci 587cabdff1aSopenharmony_ci for (i = 0; i < h; i++) { 588cabdff1aSopenharmony_ci uint32_t a, b; 589cabdff1aSopenharmony_ci 590cabdff1aSopenharmony_ci a = AV_RN32(&src1[i * stride]); 591cabdff1aSopenharmony_ci b = AV_RN32(&src2[i * stride]); 592cabdff1aSopenharmony_ci AV_WN32A(&dst[i * stride], no_rnd_avg32(a, b)); 593cabdff1aSopenharmony_ci a = AV_RN32(&src1[i * stride + 4]); 594cabdff1aSopenharmony_ci b = AV_RN32(&src2[i * stride + 4]); 595cabdff1aSopenharmony_ci AV_WN32A(&dst[i * stride + 4], no_rnd_avg32(a, b)); 596cabdff1aSopenharmony_ci } 597cabdff1aSopenharmony_ci } 598cabdff1aSopenharmony_ci} 599