/*
 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp9dsp.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "vp9dsp_mips.h"

/* Saturating subtract of in0/in1 from the running sums out0/out1. */
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)  \
{                                                \
    out0 = __msa_subs_u_h(out0, in0);            \
    out1 = __msa_subs_u_h(out1, in1);            \
}

/* Vertical prediction: replicate the top edge row (src) into every row. */
void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
                       const uint8_t *src)
{
    uint32_t row;
    v16u8 src0;

    src0 = LD_UB(src);

    for (row = 16; row--;) {
        ST_UB(src0, dst);
        dst += dst_stride;
    }
}

void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
                       const uint8_t *src)
{
    uint32_t row;
    v16u8 src1, src2;

    src1 = LD_UB(src);
    src2 = LD_UB(src + 16);

    for (row = 32; row--;) {
        ST_UB2(src1, src2, dst, 16);
        dst += dst_stride;
    }
}

/* Horizontal prediction: broadcast one left-edge pixel (the src argument)
 * across each row, reading the edge array from its end toward its start. */
void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
                      const uint8_t *top)
{
    uint32_t row, inp;
    v16u8 src0, src1, src2, src3;

    src += 12;
    for (row = 4; row--;) {
        inp = LW(src);
        src -= 4;

        src0 = (v16u8) __msa_fill_b(inp >> 24);
        src1 = (v16u8) __msa_fill_b(inp >> 16);
        src2 = (v16u8) __msa_fill_b(inp >> 8);
        src3 = (v16u8) __msa_fill_b(inp);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
                      const uint8_t *top)
{
    uint32_t row, inp;
    v16u8 src0, src1, src2, src3;

    src += 28;
    for (row = 8; row--;) {
        inp = LW(src);
        src -= 4;

        src0 = (v16u8) __msa_fill_b(inp >> 24);
        src1 = (v16u8) __msa_fill_b(inp >> 16);
        src2 = (v16u8) __msa_fill_b(inp >> 8);
        src3 = (v16u8) __msa_fill_b(inp);

        ST_UB2(src0, src0, dst, 16);
        dst += dst_stride;
        ST_UB2(src1, src1, dst, 16);
        dst += dst_stride;
        ST_UB2(src2, src2, dst, 16);
        dst += dst_stride;
        ST_UB2(src3, src3, dst, 16);
        dst += dst_stride;
    }
}

/* DC prediction: rounded average of the top and left edge pixels,
 * broadcast over the whole block. */
void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
                   const uint8_t *src_top)
{
    uint32_t val0, val1;
    v16i8 store, src = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;

    val0 = LW(src_top);
    val1 = LW(src_left);
    INSERT_W2_SB(val0, val1, src);
    sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum_w = __msa_hadd_u_w(sum_h, sum_h);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);
    store = __msa_splati_b((v16i8) sum_w, 0);
    val0 = __msa_copy_u_w((v4i32) store, 0);

    SW4(val0, val0, val0, val0, dst, dst_stride);
}

/* DC prediction from a single edge: only the 'dir' (top or left) pixels
 * are averaged. */
#define INTRA_DC_TL_4x4(dir)                                       \
void ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,     \
                           const uint8_t *left,                    \
                           const uint8_t *top)                     \
{                                                                  \
    uint32_t val0;                                                 \
    v16i8 store, data = { 0 };                                     \
    v8u16 sum_h;                                                   \
    v4u32 sum_w;                                                   \
                                                                   \
    val0 = LW(dir);                                                \
    data = (v16i8) __msa_insert_w((v4i32) data, 0, val0);          \
    sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data);            \
    sum_w = __msa_hadd_u_w(sum_h, sum_h);                          \
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2);               \
    store = __msa_splati_b((v16i8) sum_w, 0);                      \
    val0 = __msa_copy_u_w((v4i32) store, 0);                       \
                                                                   \
    SW4(val0, val0, val0, val0, dst, dst_stride);                  \
}
INTRA_DC_TL_4x4(top);
INTRA_DC_TL_4x4(left);

void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
                   const uint8_t *src_top)
{
    uint64_t val0, val1;
    v16i8 store;
    v16u8 src = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;

    val0 = LD(src_top);
    val1 = LD(src_left);
    INSERT_D2_UB(val0, val1, src);
    sum_h = __msa_hadd_u_h(src, src);
    sum_w = __msa_hadd_u_w(sum_h, sum_h);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);
    store = __msa_splati_b((v16i8) sum_w, 0);
    val0 = __msa_copy_u_d((v2i64) store, 0);

    SD4(val0, val0, val0, val0, dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(val0, val0, val0, val0, dst, dst_stride);
}

#define INTRA_DC_TL_8x8(dir)                                       \
void ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,     \
                           const uint8_t *left,                    \
                           const uint8_t *top)                     \
{                                                                  \
    uint64_t val0;                                                 \
    v16i8 store;                                                   \
    v16u8 data = { 0 };                                            \
    v8u16 sum_h;                                                   \
    v4u32 sum_w;                                                   \
    v2u64 sum_d;                                                   \
                                                                   \
    val0 = LD(dir);                                                \
    data = (v16u8) __msa_insert_d((v2i64) data, 0, val0);          \
    sum_h = __msa_hadd_u_h(data, data);                            \
    sum_w = __msa_hadd_u_w(sum_h, sum_h);                          \
    sum_d = __msa_hadd_u_d(sum_w, sum_w);                          \
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);               \
    store = __msa_splati_b((v16i8) sum_w, 0);                      \
    val0 = __msa_copy_u_d((v2i64) store, 0);                       \
                                                                   \
    SD4(val0, val0, val0, val0, dst, dst_stride);                  \
    dst += (4 * dst_stride);                                       \
    SD4(val0, val0, val0, val0, dst, dst_stride);                  \
}

INTRA_DC_TL_8x8(top);
INTRA_DC_TL_8x8(left);

void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
                     const uint8_t *src_left, const uint8_t *src_top)
{
    v16u8 top, left, out;
    v8u16 sum_h, sum_top, sum_left;
    v4u32 sum_w;
    v2u64 sum_d;

    top = LD_UB(src_top);
    left = LD_UB(src_left);
    HADD_UB2_UH(top, left, sum_top, sum_left);
    sum_h = sum_top + sum_left;
    sum_w = __msa_hadd_u_w(sum_h, sum_h);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);
    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);

    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
    dst += (8 * dst_stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}

#define INTRA_DC_TL_16x16(dir)                                            \
void ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,          \
                             const uint8_t *left,                         \
                             const uint8_t *top)                          \
{                                                                         \
    v16u8 data, out;                                                      \
    v8u16 sum_h;                                                          \
    v4u32 sum_w;                                                          \
    v2u64 sum_d;                                                          \
                                                                          \
    data = LD_UB(dir);                                                    \
    sum_h = __msa_hadd_u_h(data, data);                                   \
    sum_w = __msa_hadd_u_w(sum_h, sum_h);                                 \
    sum_d = __msa_hadd_u_d(sum_w, sum_w);                                 \
    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);          \
    sum_d = __msa_hadd_u_d(sum_w, sum_w);                                 \
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);                      \
    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);                       \
                                                                          \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);      \
    dst += (8 * dst_stride);                                              \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);      \
}
INTRA_DC_TL_16x16(top);
INTRA_DC_TL_16x16(left);

void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
                     const uint8_t *src_left, const uint8_t *src_top)
{
    uint32_t row;
    v16u8 top0, top1, left0, left1, out;
    v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
    v4u32 sum_w;
    v2u64 sum_d;

    LD_UB2(src_top, 16, top0, top1);
    LD_UB2(src_left, 16, left0, left1);
    HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
    HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
    sum_h = sum_top0 + sum_top1;
    sum_h += sum_left0 + sum_left1;
    sum_w = __msa_hadd_u_w(sum_h, sum_h);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 6);
    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);

    for (row = 16; row--;)
    {
        ST_UB2(out, out, dst, 16);
        dst += dst_stride;
        ST_UB2(out, out, dst, 16);
        dst += dst_stride;
    }
}

#define INTRA_DC_TL_32x32(dir)                                            \
void ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,          \
                             const uint8_t *left,                         \
                             const uint8_t *top)                          \
{                                                                         \
    uint32_t row;                                                         \
    v16u8 data0, data1, out;                                              \
    v8u16 sum_h, sum_data0, sum_data1;                                    \
    v4u32 sum_w;                                                          \
    v2u64 sum_d;                                                          \
                                                                          \
    LD_UB2(dir, 16, data0, data1);                                        \
    HADD_UB2_UH(data0, data1, sum_data0, sum_data1);                      \
    sum_h = sum_data0 + sum_data1;                                        \
    sum_w = __msa_hadd_u_w(sum_h, sum_h);                                 \
    sum_d = __msa_hadd_u_d(sum_w, sum_w);                                 \
    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);          \
    sum_d = __msa_hadd_u_d(sum_w, sum_w);                                 \
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);                      \
    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);                       \
                                                                          \
    for (row = 16; row--;)                                                \
    {                                                                     \
        ST_UB2(out, out, dst, 16);                                        \
        dst += dst_stride;                                                \
        ST_UB2(out, out, dst, 16);                                        \
        dst += dst_stride;                                                \
    }                                                                     \
}
INTRA_DC_TL_32x32(top);
INTRA_DC_TL_32x32(left);

/* DC prediction with a fixed value (127, 128 or 129), used when one or both
 * prediction edges are unavailable. */
#define INTRA_PREDICT_VALDC_16X16_MSA(val)                                \
void ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,          \
                             const uint8_t *left, const uint8_t *top)     \
{                                                                         \
    v16u8 out = (v16u8) __msa_ldi_b(val);                                 \
                                                                          \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);      \
    dst += (8 * dst_stride);                                              \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);      \
}

INTRA_PREDICT_VALDC_16X16_MSA(127);
INTRA_PREDICT_VALDC_16X16_MSA(128);
INTRA_PREDICT_VALDC_16X16_MSA(129);

#define INTRA_PREDICT_VALDC_32X32_MSA(val)                                \
void ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,          \
                             const uint8_t *left, const uint8_t *top)     \
{                                                                         \
    uint32_t row;                                                         \
    v16u8 out = (v16u8) __msa_ldi_b(val);                                 \
                                                                          \
    for (row = 16; row--;)                                                \
    {                                                                     \
        ST_UB2(out, out, dst, 16);                                        \
        dst += dst_stride;                                                \
        ST_UB2(out, out, dst, 16);                                        \
        dst += dst_stride;                                                \
    }                                                                     \
}

INTRA_PREDICT_VALDC_32X32_MSA(127);
INTRA_PREDICT_VALDC_32X32_MSA(128);
INTRA_PREDICT_VALDC_32X32_MSA(129);

/* TM (TrueMotion) prediction: each pixel is left + top - top_left, clipped
 * to [0, 255] via the unsigned saturating subtract and saturate steps. */
void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                   const uint8_t *src_left,
                   const uint8_t *src_top_ptr)
{
    uint32_t left;
    uint8_t top_left = src_top_ptr[-1];
    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
    v16u8 src0, src1, src2, src3;
    v8u16 src_top_left, vec0, vec1, vec2, vec3;

    src_top_left = (v8u16) __msa_fill_h(top_left);
    src_top = LD_SB(src_top_ptr);
    left = LW(src_left);
    src_left0 = __msa_fill_b(left >> 24);
    src_left1 = __msa_fill_b(left >> 16);
    src_left2 = __msa_fill_b(left >> 8);
    src_left3 = __msa_fill_b(left);

    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
               src_left3, src_top, src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST_W2(tmp0, 0, 2, dst, dst_stride);
    ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
}

void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
                   const uint8_t *src_left, const uint8_t *src_top_ptr)
{
    uint8_t top_left = src_top_ptr[-1];
    uint32_t loop_cnt, left;
    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
    v8u16 src_top_left, vec0, vec1, vec2, vec3;
    v16u8 src0, src1, src2, src3;

    src_top = LD_SB(src_top_ptr);
    src_top_left = (v8u16) __msa_fill_h(top_left);

    src_left += 4;
    for (loop_cnt = 2; loop_cnt--;) {
        left = LW(src_left);
        src_left0 = __msa_fill_b(left >> 24);
        src_left1 = __msa_fill_b(left >> 16);
        src_left2 = __msa_fill_b(left >> 8);
        src_left3 = __msa_fill_b(left);
        src_left -= 4;

        ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
                   src_left3, src_top, src0, src1, src2, src3);
        HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
        SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
        PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
                     const uint8_t *src_left, const uint8_t *src_top_ptr)
{
    uint8_t top_left = src_top_ptr[-1];
    uint32_t loop_cnt, left;
    v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
    v8u16 src_top_left, res_r, res_l;

    src_top = LD_SB(src_top_ptr);
    src_top_left = (v8u16) __msa_fill_h(top_left);

    src_left += 12;
    for (loop_cnt = 4; loop_cnt--;) {
        left = LW(src_left);
        src_left0 = __msa_fill_b(left >> 24);
        src_left1 = __msa_fill_b(left >> 16);
        src_left2 = __msa_fill_b(left >> 8);
        src_left3 = __msa_fill_b(left);
        src_left -= 4;

        ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
        HADD_UB2_UH(res_r, res_l, res_r, res_l);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
        SAT_UH2_UH(res_r, res_l, 7);
        PCKEV_ST_SB(res_r, res_l, dst);
        dst += dst_stride;

        ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
        HADD_UB2_UH(res_r, res_l, res_r, res_l);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
        SAT_UH2_UH(res_r, res_l, 7);
        PCKEV_ST_SB(res_r, res_l, dst);
        dst += dst_stride;

        ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
        HADD_UB2_UH(res_r, res_l, res_r, res_l);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
        SAT_UH2_UH(res_r, res_l, 7);
        PCKEV_ST_SB(res_r, res_l, dst);
        dst += dst_stride;

        ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
        HADD_UB2_UH(res_r, res_l, res_r, res_l);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
        SAT_UH2_UH(res_r, res_l, 7);
        PCKEV_ST_SB(res_r, res_l, dst);
        dst += dst_stride;
    }
}

void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
                     const uint8_t *src_left, const uint8_t *src_top_ptr)
{
    uint8_t top_left = src_top_ptr[-1];
    uint32_t loop_cnt, left;
    v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
    v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;

    src_top0 = LD_SB(src_top_ptr);
    src_top1 = LD_SB(src_top_ptr + 16);
    src_top_left = (v8u16) __msa_fill_h(top_left);

    src_left += 28;
    for (loop_cnt = 8; loop_cnt--;) {
        left = LW(src_left);
        src_left0 = __msa_fill_b(left >> 24);
        src_left1 = __msa_fill_b(left >> 16);
        src_left2 = __msa_fill_b(left >> 8);
        src_left3 = __msa_fill_b(left);
        src_left -= 4;

        ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
        ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                    res_l1);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
        PCKEV_ST_SB(res_r0, res_l0, dst);
        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
        dst += dst_stride;

        ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
        ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                    res_l1);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
        PCKEV_ST_SB(res_r0, res_l0, dst);
        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
        dst += dst_stride;

        ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
        ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                    res_l1);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
        PCKEV_ST_SB(res_r0, res_l0, dst);
        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
        dst += dst_stride;

        ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
        ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                    res_l1);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
        PCKEV_ST_SB(res_r0, res_l0, dst);
        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
        dst += dst_stride;
    }
}