1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavcodec/vp9dsp.h" 22cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 23cabdff1aSopenharmony_ci#include "vp9dsp_mips.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_ci#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ 26cabdff1aSopenharmony_ci{ \ 27cabdff1aSopenharmony_ci out0 = __msa_subs_u_h(out0, in0); \ 28cabdff1aSopenharmony_ci out1 = __msa_subs_u_h(out1, in1); \ 29cabdff1aSopenharmony_ci} 30cabdff1aSopenharmony_ci 31cabdff1aSopenharmony_civoid ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, 32cabdff1aSopenharmony_ci const uint8_t *src) 33cabdff1aSopenharmony_ci{ 34cabdff1aSopenharmony_ci uint32_t row; 35cabdff1aSopenharmony_ci v16u8 src0; 36cabdff1aSopenharmony_ci 37cabdff1aSopenharmony_ci src0 = LD_UB(src); 38cabdff1aSopenharmony_ci 39cabdff1aSopenharmony_ci for (row = 16; row--;) { 40cabdff1aSopenharmony_ci ST_UB(src0, dst); 41cabdff1aSopenharmony_ci dst += dst_stride; 42cabdff1aSopenharmony_ci } 43cabdff1aSopenharmony_ci} 44cabdff1aSopenharmony_ci 45cabdff1aSopenharmony_civoid ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, 46cabdff1aSopenharmony_ci const uint8_t *src) 47cabdff1aSopenharmony_ci{ 48cabdff1aSopenharmony_ci uint32_t row; 49cabdff1aSopenharmony_ci v16u8 src1, src2; 50cabdff1aSopenharmony_ci 51cabdff1aSopenharmony_ci src1 = LD_UB(src); 52cabdff1aSopenharmony_ci src2 = LD_UB(src + 16); 53cabdff1aSopenharmony_ci 54cabdff1aSopenharmony_ci for (row = 32; row--;) { 55cabdff1aSopenharmony_ci ST_UB2(src1, src2, dst, 16); 56cabdff1aSopenharmony_ci dst += dst_stride; 57cabdff1aSopenharmony_ci } 58cabdff1aSopenharmony_ci} 59cabdff1aSopenharmony_ci 60cabdff1aSopenharmony_civoid ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, 61cabdff1aSopenharmony_ci const uint8_t *top) 62cabdff1aSopenharmony_ci{ 63cabdff1aSopenharmony_ci uint32_t row, inp; 64cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3; 65cabdff1aSopenharmony_ci 66cabdff1aSopenharmony_ci src += 12; 67cabdff1aSopenharmony_ci for (row = 4; row--;) { 68cabdff1aSopenharmony_ci inp = LW(src); 69cabdff1aSopenharmony_ci src -= 4; 70cabdff1aSopenharmony_ci 71cabdff1aSopenharmony_ci src0 = (v16u8) __msa_fill_b(inp >> 24); 72cabdff1aSopenharmony_ci src1 = (v16u8) __msa_fill_b(inp >> 16); 73cabdff1aSopenharmony_ci src2 = (v16u8) __msa_fill_b(inp >> 8); 74cabdff1aSopenharmony_ci src3 = (v16u8) __msa_fill_b(inp); 75cabdff1aSopenharmony_ci 76cabdff1aSopenharmony_ci ST_UB4(src0, src1, src2, src3, dst, dst_stride); 77cabdff1aSopenharmony_ci dst += (4 * dst_stride); 78cabdff1aSopenharmony_ci } 79cabdff1aSopenharmony_ci} 80cabdff1aSopenharmony_ci 81cabdff1aSopenharmony_civoid ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, 82cabdff1aSopenharmony_ci const uint8_t *top) 83cabdff1aSopenharmony_ci{ 84cabdff1aSopenharmony_ci uint32_t row, inp; 85cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3; 86cabdff1aSopenharmony_ci 87cabdff1aSopenharmony_ci src += 28; 88cabdff1aSopenharmony_ci for (row = 8; row--;) { 89cabdff1aSopenharmony_ci inp = LW(src); 90cabdff1aSopenharmony_ci src -= 4; 91cabdff1aSopenharmony_ci 92cabdff1aSopenharmony_ci src0 = (v16u8) __msa_fill_b(inp >> 24); 93cabdff1aSopenharmony_ci src1 = (v16u8) __msa_fill_b(inp >> 16); 94cabdff1aSopenharmony_ci src2 = (v16u8) __msa_fill_b(inp >> 8); 95cabdff1aSopenharmony_ci src3 = (v16u8) __msa_fill_b(inp); 96cabdff1aSopenharmony_ci 97cabdff1aSopenharmony_ci ST_UB2(src0, src0, dst, 16); 98cabdff1aSopenharmony_ci dst += dst_stride; 99cabdff1aSopenharmony_ci ST_UB2(src1, src1, dst, 16); 100cabdff1aSopenharmony_ci dst += dst_stride; 101cabdff1aSopenharmony_ci ST_UB2(src2, src2, dst, 16); 102cabdff1aSopenharmony_ci dst += dst_stride; 103cabdff1aSopenharmony_ci ST_UB2(src3, src3, dst, 16); 104cabdff1aSopenharmony_ci dst += dst_stride; 105cabdff1aSopenharmony_ci } 106cabdff1aSopenharmony_ci} 107cabdff1aSopenharmony_ci 108cabdff1aSopenharmony_civoid ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, 109cabdff1aSopenharmony_ci const uint8_t *src_top) 110cabdff1aSopenharmony_ci{ 111cabdff1aSopenharmony_ci uint32_t val0, val1; 112cabdff1aSopenharmony_ci v16i8 store, src = { 0 }; 113cabdff1aSopenharmony_ci v8u16 sum_h; 114cabdff1aSopenharmony_ci v4u32 sum_w; 115cabdff1aSopenharmony_ci v2u64 sum_d; 116cabdff1aSopenharmony_ci 117cabdff1aSopenharmony_ci val0 = LW(src_top); 118cabdff1aSopenharmony_ci val1 = LW(src_left); 119cabdff1aSopenharmony_ci INSERT_W2_SB(val0, val1, src); 120cabdff1aSopenharmony_ci sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src); 121cabdff1aSopenharmony_ci sum_w = __msa_hadd_u_w(sum_h, sum_h); 122cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); 123cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3); 124cabdff1aSopenharmony_ci store = __msa_splati_b((v16i8) sum_w, 0); 125cabdff1aSopenharmony_ci val0 = __msa_copy_u_w((v4i32) store, 0); 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_ci SW4(val0, val0, val0, val0, dst, dst_stride); 128cabdff1aSopenharmony_ci} 129cabdff1aSopenharmony_ci 130cabdff1aSopenharmony_ci#define INTRA_DC_TL_4x4(dir) \ 131cabdff1aSopenharmony_civoid ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 132cabdff1aSopenharmony_ci const uint8_t *left, \ 133cabdff1aSopenharmony_ci const uint8_t *top) \ 134cabdff1aSopenharmony_ci{ \ 135cabdff1aSopenharmony_ci uint32_t val0; \ 136cabdff1aSopenharmony_ci v16i8 store, data = { 0 }; \ 137cabdff1aSopenharmony_ci v8u16 sum_h; \ 138cabdff1aSopenharmony_ci v4u32 sum_w; \ 139cabdff1aSopenharmony_ci \ 140cabdff1aSopenharmony_ci val0 = LW(dir); \ 141cabdff1aSopenharmony_ci data = (v16i8) __msa_insert_w((v4i32) data, 0, val0); \ 142cabdff1aSopenharmony_ci sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data); \ 143cabdff1aSopenharmony_ci sum_w = __msa_hadd_u_w(sum_h, sum_h); \ 144cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2); \ 145cabdff1aSopenharmony_ci store = __msa_splati_b((v16i8) sum_w, 0); \ 146cabdff1aSopenharmony_ci val0 = __msa_copy_u_w((v4i32) store, 0); \ 147cabdff1aSopenharmony_ci \ 148cabdff1aSopenharmony_ci SW4(val0, val0, val0, val0, dst, dst_stride); \ 149cabdff1aSopenharmony_ci} 150cabdff1aSopenharmony_ciINTRA_DC_TL_4x4(top); 151cabdff1aSopenharmony_ciINTRA_DC_TL_4x4(left); 152cabdff1aSopenharmony_ci 153cabdff1aSopenharmony_civoid ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, 154cabdff1aSopenharmony_ci const uint8_t *src_top) 155cabdff1aSopenharmony_ci{ 156cabdff1aSopenharmony_ci uint64_t val0, val1; 157cabdff1aSopenharmony_ci v16i8 store; 158cabdff1aSopenharmony_ci v16u8 src = { 0 }; 159cabdff1aSopenharmony_ci v8u16 sum_h; 160cabdff1aSopenharmony_ci v4u32 sum_w; 161cabdff1aSopenharmony_ci v2u64 sum_d; 162cabdff1aSopenharmony_ci 163cabdff1aSopenharmony_ci val0 = LD(src_top); 164cabdff1aSopenharmony_ci val1 = LD(src_left); 165cabdff1aSopenharmony_ci INSERT_D2_UB(val0, val1, src); 166cabdff1aSopenharmony_ci sum_h = __msa_hadd_u_h(src, src); 167cabdff1aSopenharmony_ci sum_w = __msa_hadd_u_w(sum_h, sum_h); 168cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); 169cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); 170cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); 171cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4); 172cabdff1aSopenharmony_ci store = __msa_splati_b((v16i8) sum_w, 0); 173cabdff1aSopenharmony_ci val0 = __msa_copy_u_d((v2i64) store, 0); 174cabdff1aSopenharmony_ci 175cabdff1aSopenharmony_ci SD4(val0, val0, val0, val0, dst, dst_stride); 176cabdff1aSopenharmony_ci dst += (4 * dst_stride); 177cabdff1aSopenharmony_ci SD4(val0, val0, val0, val0, dst, dst_stride); 178cabdff1aSopenharmony_ci} 179cabdff1aSopenharmony_ci 180cabdff1aSopenharmony_ci#define INTRA_DC_TL_8x8(dir) \ 181cabdff1aSopenharmony_civoid ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 182cabdff1aSopenharmony_ci const uint8_t *left, \ 183cabdff1aSopenharmony_ci const uint8_t *top) \ 184cabdff1aSopenharmony_ci{ \ 185cabdff1aSopenharmony_ci uint64_t val0; \ 186cabdff1aSopenharmony_ci v16i8 store; \ 187cabdff1aSopenharmony_ci v16u8 data = { 0 }; \ 188cabdff1aSopenharmony_ci v8u16 sum_h; \ 189cabdff1aSopenharmony_ci v4u32 sum_w; \ 190cabdff1aSopenharmony_ci v2u64 sum_d; \ 191cabdff1aSopenharmony_ci \ 192cabdff1aSopenharmony_ci val0 = LD(dir); \ 193cabdff1aSopenharmony_ci data = (v16u8) __msa_insert_d((v2i64) data, 0, val0); \ 194cabdff1aSopenharmony_ci sum_h = __msa_hadd_u_h(data, data); \ 195cabdff1aSopenharmony_ci sum_w = __msa_hadd_u_w(sum_h, sum_h); \ 196cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); \ 197cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3); \ 198cabdff1aSopenharmony_ci store = __msa_splati_b((v16i8) sum_w, 0); \ 199cabdff1aSopenharmony_ci val0 = __msa_copy_u_d((v2i64) store, 0); \ 200cabdff1aSopenharmony_ci \ 201cabdff1aSopenharmony_ci SD4(val0, val0, val0, val0, dst, dst_stride); \ 202cabdff1aSopenharmony_ci dst += (4 * dst_stride); \ 203cabdff1aSopenharmony_ci SD4(val0, val0, val0, val0, dst, dst_stride); \ 204cabdff1aSopenharmony_ci} 205cabdff1aSopenharmony_ci 206cabdff1aSopenharmony_ciINTRA_DC_TL_8x8(top); 207cabdff1aSopenharmony_ciINTRA_DC_TL_8x8(left); 208cabdff1aSopenharmony_ci 209cabdff1aSopenharmony_civoid ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, 210cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top) 211cabdff1aSopenharmony_ci{ 212cabdff1aSopenharmony_ci v16u8 top, left, out; 213cabdff1aSopenharmony_ci v8u16 sum_h, sum_top, sum_left; 214cabdff1aSopenharmony_ci v4u32 sum_w; 215cabdff1aSopenharmony_ci v2u64 sum_d; 216cabdff1aSopenharmony_ci 217cabdff1aSopenharmony_ci top = LD_UB(src_top); 218cabdff1aSopenharmony_ci left = LD_UB(src_left); 219cabdff1aSopenharmony_ci HADD_UB2_UH(top, left, sum_top, sum_left); 220cabdff1aSopenharmony_ci sum_h = sum_top + sum_left; 221cabdff1aSopenharmony_ci sum_w = __msa_hadd_u_w(sum_h, sum_h); 222cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); 223cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); 224cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); 225cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5); 226cabdff1aSopenharmony_ci out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); 227cabdff1aSopenharmony_ci 228cabdff1aSopenharmony_ci ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); 229cabdff1aSopenharmony_ci dst += (8 * dst_stride); 230cabdff1aSopenharmony_ci ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); 231cabdff1aSopenharmony_ci} 232cabdff1aSopenharmony_ci 233cabdff1aSopenharmony_ci#define INTRA_DC_TL_16x16(dir) \ 234cabdff1aSopenharmony_civoid ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 235cabdff1aSopenharmony_ci const uint8_t *left, \ 236cabdff1aSopenharmony_ci const uint8_t *top) \ 237cabdff1aSopenharmony_ci{ \ 238cabdff1aSopenharmony_ci v16u8 data, out; \ 239cabdff1aSopenharmony_ci v8u16 sum_h; \ 240cabdff1aSopenharmony_ci v4u32 sum_w; \ 241cabdff1aSopenharmony_ci v2u64 sum_d; \ 242cabdff1aSopenharmony_ci \ 243cabdff1aSopenharmony_ci data = LD_UB(dir); \ 244cabdff1aSopenharmony_ci sum_h = __msa_hadd_u_h(data, data); \ 245cabdff1aSopenharmony_ci sum_w = __msa_hadd_u_w(sum_h, sum_h); \ 246cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); \ 247cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); \ 248cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); \ 249cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4); \ 250cabdff1aSopenharmony_ci out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); \ 251cabdff1aSopenharmony_ci \ 252cabdff1aSopenharmony_ci ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ 253cabdff1aSopenharmony_ci dst += (8 * dst_stride); \ 254cabdff1aSopenharmony_ci ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ 255cabdff1aSopenharmony_ci} 256cabdff1aSopenharmony_ciINTRA_DC_TL_16x16(top); 257cabdff1aSopenharmony_ciINTRA_DC_TL_16x16(left); 258cabdff1aSopenharmony_ci 259cabdff1aSopenharmony_civoid ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, 260cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top) 261cabdff1aSopenharmony_ci{ 262cabdff1aSopenharmony_ci uint32_t row; 263cabdff1aSopenharmony_ci v16u8 top0, top1, left0, left1, out; 264cabdff1aSopenharmony_ci v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; 265cabdff1aSopenharmony_ci v4u32 sum_w; 266cabdff1aSopenharmony_ci v2u64 sum_d; 267cabdff1aSopenharmony_ci 268cabdff1aSopenharmony_ci LD_UB2(src_top, 16, top0, top1); 269cabdff1aSopenharmony_ci LD_UB2(src_left, 16, left0, left1); 270cabdff1aSopenharmony_ci HADD_UB2_UH(top0, top1, sum_top0, sum_top1); 271cabdff1aSopenharmony_ci HADD_UB2_UH(left0, left1, sum_left0, sum_left1); 272cabdff1aSopenharmony_ci sum_h = sum_top0 + sum_top1; 273cabdff1aSopenharmony_ci sum_h += sum_left0 + sum_left1; 274cabdff1aSopenharmony_ci sum_w = __msa_hadd_u_w(sum_h, sum_h); 275cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); 276cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); 277cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); 278cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 6); 279cabdff1aSopenharmony_ci out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); 280cabdff1aSopenharmony_ci 281cabdff1aSopenharmony_ci for (row = 16; row--;) 282cabdff1aSopenharmony_ci { 283cabdff1aSopenharmony_ci ST_UB2(out, out, dst, 16); 284cabdff1aSopenharmony_ci dst += dst_stride; 285cabdff1aSopenharmony_ci ST_UB2(out, out, dst, 16); 286cabdff1aSopenharmony_ci dst += dst_stride; 287cabdff1aSopenharmony_ci } 288cabdff1aSopenharmony_ci} 289cabdff1aSopenharmony_ci 290cabdff1aSopenharmony_ci#define INTRA_DC_TL_32x32(dir) \ 291cabdff1aSopenharmony_civoid ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 292cabdff1aSopenharmony_ci const uint8_t *left, \ 293cabdff1aSopenharmony_ci const uint8_t *top) \ 294cabdff1aSopenharmony_ci{ \ 295cabdff1aSopenharmony_ci uint32_t row; \ 296cabdff1aSopenharmony_ci v16u8 data0, data1, out; \ 297cabdff1aSopenharmony_ci v8u16 sum_h, sum_data0, sum_data1; \ 298cabdff1aSopenharmony_ci v4u32 sum_w; \ 299cabdff1aSopenharmony_ci v2u64 sum_d; \ 300cabdff1aSopenharmony_ci \ 301cabdff1aSopenharmony_ci LD_UB2(dir, 16, data0, data1); \ 302cabdff1aSopenharmony_ci HADD_UB2_UH(data0, data1, sum_data0, sum_data1); \ 303cabdff1aSopenharmony_ci sum_h = sum_data0 + sum_data1; \ 304cabdff1aSopenharmony_ci sum_w = __msa_hadd_u_w(sum_h, sum_h); \ 305cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); \ 306cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); \ 307cabdff1aSopenharmony_ci sum_d = __msa_hadd_u_d(sum_w, sum_w); \ 308cabdff1aSopenharmony_ci sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5); \ 309cabdff1aSopenharmony_ci out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); \ 310cabdff1aSopenharmony_ci \ 311cabdff1aSopenharmony_ci for (row = 16; row--;) \ 312cabdff1aSopenharmony_ci { \ 313cabdff1aSopenharmony_ci ST_UB2(out, out, dst, 16); \ 314cabdff1aSopenharmony_ci dst += dst_stride; \ 315cabdff1aSopenharmony_ci ST_UB2(out, out, dst, 16); \ 316cabdff1aSopenharmony_ci dst += dst_stride; \ 317cabdff1aSopenharmony_ci } \ 318cabdff1aSopenharmony_ci} 319cabdff1aSopenharmony_ciINTRA_DC_TL_32x32(top); 320cabdff1aSopenharmony_ciINTRA_DC_TL_32x32(left); 321cabdff1aSopenharmony_ci 322cabdff1aSopenharmony_ci#define INTRA_PREDICT_VALDC_16X16_MSA(val) \ 323cabdff1aSopenharmony_civoid ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 324cabdff1aSopenharmony_ci const uint8_t *left, const uint8_t *top) \ 325cabdff1aSopenharmony_ci{ \ 326cabdff1aSopenharmony_ci v16u8 out = (v16u8) __msa_ldi_b(val); \ 327cabdff1aSopenharmony_ci \ 328cabdff1aSopenharmony_ci ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ 329cabdff1aSopenharmony_ci dst += (8 * dst_stride); \ 330cabdff1aSopenharmony_ci ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ 331cabdff1aSopenharmony_ci} 332cabdff1aSopenharmony_ci 333cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_MSA(127); 334cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_MSA(128); 335cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_MSA(129); 336cabdff1aSopenharmony_ci 337cabdff1aSopenharmony_ci#define INTRA_PREDICT_VALDC_32X32_MSA(val) \ 338cabdff1aSopenharmony_civoid ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, \ 339cabdff1aSopenharmony_ci const uint8_t *left, const uint8_t *top) \ 340cabdff1aSopenharmony_ci{ \ 341cabdff1aSopenharmony_ci uint32_t row; \ 342cabdff1aSopenharmony_ci v16u8 out = (v16u8) __msa_ldi_b(val); \ 343cabdff1aSopenharmony_ci \ 344cabdff1aSopenharmony_ci for (row = 16; row--;) \ 345cabdff1aSopenharmony_ci { \ 346cabdff1aSopenharmony_ci ST_UB2(out, out, dst, 16); \ 347cabdff1aSopenharmony_ci dst += dst_stride; \ 348cabdff1aSopenharmony_ci ST_UB2(out, out, dst, 16); \ 349cabdff1aSopenharmony_ci dst += dst_stride; \ 350cabdff1aSopenharmony_ci } \ 351cabdff1aSopenharmony_ci} 352cabdff1aSopenharmony_ci 353cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_MSA(127); 354cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_MSA(128); 355cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_MSA(129); 356cabdff1aSopenharmony_ci 357cabdff1aSopenharmony_civoid ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, 358cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top_ptr) 359cabdff1aSopenharmony_ci{ 360cabdff1aSopenharmony_ci uint32_t left; 361cabdff1aSopenharmony_ci uint8_t top_left = src_top_ptr[-1]; 362cabdff1aSopenharmony_ci v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1; 363cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3; 364cabdff1aSopenharmony_ci v8u16 src_top_left, vec0, vec1, vec2, vec3; 365cabdff1aSopenharmony_ci 366cabdff1aSopenharmony_ci src_top_left = (v8u16) __msa_fill_h(top_left); 367cabdff1aSopenharmony_ci src_top = LD_SB(src_top_ptr); 368cabdff1aSopenharmony_ci left = LW(src_left); 369cabdff1aSopenharmony_ci src_left0 = __msa_fill_b(left >> 24); 370cabdff1aSopenharmony_ci src_left1 = __msa_fill_b(left >> 16); 371cabdff1aSopenharmony_ci src_left2 = __msa_fill_b(left >> 8); 372cabdff1aSopenharmony_ci src_left3 = __msa_fill_b(left); 373cabdff1aSopenharmony_ci 374cabdff1aSopenharmony_ci ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, 375cabdff1aSopenharmony_ci src_left3, src_top, src0, src1, src2, src3); 376cabdff1aSopenharmony_ci HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); 377cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); 378cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); 379cabdff1aSopenharmony_ci SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); 380cabdff1aSopenharmony_ci PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); 381cabdff1aSopenharmony_ci ST_W2(tmp0, 0, 2, dst, dst_stride); 382cabdff1aSopenharmony_ci ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride); 383cabdff1aSopenharmony_ci} 384cabdff1aSopenharmony_ci 385cabdff1aSopenharmony_civoid ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, 386cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top_ptr) 387cabdff1aSopenharmony_ci{ 388cabdff1aSopenharmony_ci uint8_t top_left = src_top_ptr[-1]; 389cabdff1aSopenharmony_ci uint32_t loop_cnt, left; 390cabdff1aSopenharmony_ci v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1; 391cabdff1aSopenharmony_ci v8u16 src_top_left, vec0, vec1, vec2, vec3; 392cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3; 393cabdff1aSopenharmony_ci 394cabdff1aSopenharmony_ci src_top = LD_SB(src_top_ptr); 395cabdff1aSopenharmony_ci src_top_left = (v8u16) __msa_fill_h(top_left); 396cabdff1aSopenharmony_ci 397cabdff1aSopenharmony_ci src_left += 4; 398cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 399cabdff1aSopenharmony_ci left = LW(src_left); 400cabdff1aSopenharmony_ci src_left0 = __msa_fill_b(left >> 24); 401cabdff1aSopenharmony_ci src_left1 = __msa_fill_b(left >> 16); 402cabdff1aSopenharmony_ci src_left2 = __msa_fill_b(left >> 8); 403cabdff1aSopenharmony_ci src_left3 = __msa_fill_b(left); 404cabdff1aSopenharmony_ci src_left -= 4; 405cabdff1aSopenharmony_ci 406cabdff1aSopenharmony_ci ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, 407cabdff1aSopenharmony_ci src_left3, src_top, src0, src1, src2, src3); 408cabdff1aSopenharmony_ci HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); 409cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); 410cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); 411cabdff1aSopenharmony_ci SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); 412cabdff1aSopenharmony_ci PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); 413cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 414cabdff1aSopenharmony_ci dst += (4 * dst_stride); 415cabdff1aSopenharmony_ci } 416cabdff1aSopenharmony_ci} 417cabdff1aSopenharmony_ci 418cabdff1aSopenharmony_civoid ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, 419cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top_ptr) 420cabdff1aSopenharmony_ci{ 421cabdff1aSopenharmony_ci uint8_t top_left = src_top_ptr[-1]; 422cabdff1aSopenharmony_ci uint32_t loop_cnt, left; 423cabdff1aSopenharmony_ci v16i8 src_top, src_left0, src_left1, src_left2, src_left3; 424cabdff1aSopenharmony_ci v8u16 src_top_left, res_r, res_l; 425cabdff1aSopenharmony_ci 426cabdff1aSopenharmony_ci src_top = LD_SB(src_top_ptr); 427cabdff1aSopenharmony_ci src_top_left = (v8u16) __msa_fill_h(top_left); 428cabdff1aSopenharmony_ci 429cabdff1aSopenharmony_ci src_left += 12; 430cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 431cabdff1aSopenharmony_ci left = LW(src_left); 432cabdff1aSopenharmony_ci src_left0 = __msa_fill_b(left >> 24); 433cabdff1aSopenharmony_ci src_left1 = __msa_fill_b(left >> 16); 434cabdff1aSopenharmony_ci src_left2 = __msa_fill_b(left >> 8); 435cabdff1aSopenharmony_ci src_left3 = __msa_fill_b(left); 436cabdff1aSopenharmony_ci src_left -= 4; 437cabdff1aSopenharmony_ci 438cabdff1aSopenharmony_ci ILVRL_B2_UH(src_left0, src_top, res_r, res_l); 439cabdff1aSopenharmony_ci HADD_UB2_UH(res_r, res_l, res_r, res_l); 440cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); 441cabdff1aSopenharmony_ci 442cabdff1aSopenharmony_ci SAT_UH2_UH(res_r, res_l, 7); 443cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r, res_l, dst); 444cabdff1aSopenharmony_ci dst += dst_stride; 445cabdff1aSopenharmony_ci 446cabdff1aSopenharmony_ci ILVRL_B2_UH(src_left1, src_top, res_r, res_l); 447cabdff1aSopenharmony_ci HADD_UB2_UH(res_r, res_l, res_r, res_l); 448cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); 449cabdff1aSopenharmony_ci SAT_UH2_UH(res_r, res_l, 7); 450cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r, res_l, dst); 451cabdff1aSopenharmony_ci dst += dst_stride; 452cabdff1aSopenharmony_ci 453cabdff1aSopenharmony_ci ILVRL_B2_UH(src_left2, src_top, res_r, res_l); 454cabdff1aSopenharmony_ci HADD_UB2_UH(res_r, res_l, res_r, res_l); 455cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); 456cabdff1aSopenharmony_ci SAT_UH2_UH(res_r, res_l, 7); 457cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r, res_l, dst); 458cabdff1aSopenharmony_ci dst += dst_stride; 459cabdff1aSopenharmony_ci 460cabdff1aSopenharmony_ci ILVRL_B2_UH(src_left3, src_top, res_r, res_l); 461cabdff1aSopenharmony_ci HADD_UB2_UH(res_r, res_l, res_r, res_l); 462cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); 463cabdff1aSopenharmony_ci SAT_UH2_UH(res_r, res_l, 7); 464cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r, res_l, dst); 465cabdff1aSopenharmony_ci dst += dst_stride; 466cabdff1aSopenharmony_ci } 467cabdff1aSopenharmony_ci} 468cabdff1aSopenharmony_ci 469cabdff1aSopenharmony_civoid ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, 470cabdff1aSopenharmony_ci const uint8_t *src_left, const uint8_t *src_top_ptr) 471cabdff1aSopenharmony_ci{ 472cabdff1aSopenharmony_ci uint8_t top_left = src_top_ptr[-1]; 473cabdff1aSopenharmony_ci uint32_t loop_cnt, left; 474cabdff1aSopenharmony_ci v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; 475cabdff1aSopenharmony_ci v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; 476cabdff1aSopenharmony_ci 477cabdff1aSopenharmony_ci src_top0 = LD_SB(src_top_ptr); 478cabdff1aSopenharmony_ci src_top1 = LD_SB(src_top_ptr + 16); 479cabdff1aSopenharmony_ci src_top_left = (v8u16) __msa_fill_h(top_left); 480cabdff1aSopenharmony_ci 481cabdff1aSopenharmony_ci src_left += 28; 482cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 483cabdff1aSopenharmony_ci left = LW(src_left); 484cabdff1aSopenharmony_ci src_left0 = __msa_fill_b(left >> 24); 485cabdff1aSopenharmony_ci src_left1 = __msa_fill_b(left >> 16); 486cabdff1aSopenharmony_ci src_left2 = __msa_fill_b(left >> 8); 487cabdff1aSopenharmony_ci src_left3 = __msa_fill_b(left); 488cabdff1aSopenharmony_ci src_left -= 4; 489cabdff1aSopenharmony_ci 490cabdff1aSopenharmony_ci ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); 491cabdff1aSopenharmony_ci ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); 492cabdff1aSopenharmony_ci HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, 493cabdff1aSopenharmony_ci res_l1); 494cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); 495cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); 496cabdff1aSopenharmony_ci SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); 497cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r0, res_l0, dst); 498cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r1, res_l1, dst + 16); 499cabdff1aSopenharmony_ci dst += dst_stride; 500cabdff1aSopenharmony_ci 501cabdff1aSopenharmony_ci ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); 502cabdff1aSopenharmony_ci ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); 503cabdff1aSopenharmony_ci HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, 504cabdff1aSopenharmony_ci res_l1); 505cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); 506cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); 507cabdff1aSopenharmony_ci SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); 508cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r0, res_l0, dst); 509cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r1, res_l1, dst + 16); 510cabdff1aSopenharmony_ci dst += dst_stride; 511cabdff1aSopenharmony_ci 512cabdff1aSopenharmony_ci ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); 513cabdff1aSopenharmony_ci ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); 514cabdff1aSopenharmony_ci HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, 515cabdff1aSopenharmony_ci res_l1); 516cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); 517cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); 518cabdff1aSopenharmony_ci SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); 519cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r0, res_l0, dst); 520cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r1, res_l1, dst + 16); 521cabdff1aSopenharmony_ci dst += dst_stride; 522cabdff1aSopenharmony_ci 523cabdff1aSopenharmony_ci ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); 524cabdff1aSopenharmony_ci ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); 525cabdff1aSopenharmony_ci HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, 526cabdff1aSopenharmony_ci res_l1); 527cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); 528cabdff1aSopenharmony_ci IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); 529cabdff1aSopenharmony_ci SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); 530cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r0, res_l0, dst); 531cabdff1aSopenharmony_ci PCKEV_ST_SB(res_r1, res_l1, dst + 16); 532cabdff1aSopenharmony_ci dst += dst_stride; 533cabdff1aSopenharmony_ci } 534cabdff1aSopenharmony_ci} 535