/*
 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp9dsp.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "vp9dsp_mips.h"

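/* Subtract in0/in1 from the running sums out0/out1 with unsigned
 * saturation; the TM predictors below use this to remove the top-left
 * pixel without wrapping below zero. */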
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)  \
{                                                \
    out0 = __msa_subs_u_h(out0, in0);            \
    out1 = __msa_subs_u_h(out1, in1);            \
}

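/* Vertical prediction: the row of pixels above the block (src) is copied
 * into every row of the 16x16 destination. */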
void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
                       const uint8_t *src)
{
    uint32_t row;
    v16u8 src0;

    src0 = LD_UB(src);

    for (row = 16; row--;) {
        ST_UB(src0, dst);
        dst += dst_stride;
    }
}

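/* Vertical prediction for a 32x32 block: two 16-byte stores per row. */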
void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
                       const uint8_t *src)
{
    uint32_t row;
    v16u8 src1, src2;

    src1 = LD_UB(src);
    src2 = LD_UB(src + 16);

    for (row = 32; row--;) {
        ST_UB2(src1, src2, dst, 16);
        dst += dst_stride;
    }
}

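/* Horizontal prediction: every row of the destination is filled with its
 * left-edge pixel. The left-edge array (src) is walked backwards, four
 * pixels per iteration, each one splatted across a whole row. */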
void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
                      const uint8_t *top)
{
    uint32_t row, inp;
    v16u8 src0, src1, src2, src3;

    src += 12;
    for (row = 4; row--;) {
        inp = LW(src);
        src -= 4;

        src0 = (v16u8) __msa_fill_b(inp >> 24);
        src1 = (v16u8) __msa_fill_b(inp >> 16);
        src2 = (v16u8) __msa_fill_b(inp >> 8);
        src3 = (v16u8) __msa_fill_b(inp);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

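/* Horizontal prediction for a 32x32 block: as above, but each splatted
 * row needs two 16-byte stores. */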
void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
                      const uint8_t *top)
{
    uint32_t row, inp;
    v16u8 src0, src1, src2, src3;

    src += 28;
    for (row = 8; row--;) {
        inp = LW(src);
        src -= 4;

        src0 = (v16u8) __msa_fill_b(inp >> 24);
        src1 = (v16u8) __msa_fill_b(inp >> 16);
        src2 = (v16u8) __msa_fill_b(inp >> 8);
        src3 = (v16u8) __msa_fill_b(inp);

        ST_UB2(src0, src0, dst, 16);
        dst += dst_stride;
        ST_UB2(src1, src1, dst, 16);
        dst += dst_stride;
        ST_UB2(src2, src2, dst, 16);
        dst += dst_stride;
        ST_UB2(src3, src3, dst, 16);
        dst += dst_stride;
    }
}

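/* DC prediction: sum the 4 top and 4 left pixels with successive widening
 * horizontal adds, round-shift by 3 to average the 8 values, and splat
 * the result over the 4x4 block. */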
void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
                   const uint8_t *src_top)
{
    uint32_t val0, val1;
    v16i8 store, src = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;

    val0 = LW(src_top);
    val1 = LW(src_left);
    INSERT_W2_SB(val0, val1, src);
    sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum_w = __msa_hadd_u_w(sum_h, sum_h);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);
    store = __msa_splati_b((v16i8) sum_w, 0);
    val0 = __msa_copy_u_w((v4i32) store, 0);

    SW4(val0, val0, val0, val0, dst, dst_stride);
}

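/* DC_TOP / DC_LEFT 4x4: same scheme, but averaging only the 4 pixels of
 * the single available edge (round-shift by 2). */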
#define INTRA_DC_TL_4x4(dir)                                    \
void ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
                           const uint8_t *left,                 \
                           const uint8_t *top)                  \
{                                                               \
    uint32_t val0;                                              \
    v16i8 store, data = { 0 };                                  \
    v8u16 sum_h;                                                \
    v4u32 sum_w;                                                \
                                                                \
    val0 = LW(dir);                                             \
    data = (v16i8) __msa_insert_w((v4i32) data, 0, val0);       \
    sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data);         \
    sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2);            \
    store = __msa_splati_b((v16i8) sum_w, 0);                   \
    val0 = __msa_copy_u_w((v4i32) store, 0);                    \
                                                                \
    SW4(val0, val0, val0, val0, dst, dst_stride);               \
}
INTRA_DC_TL_4x4(top);
INTRA_DC_TL_4x4(left);

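/* DC prediction for an 8x8 block: rounded average of the 16 edge pixels
 * (8 top + 8 left). */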
void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
                   const uint8_t *src_top)
{
    uint64_t val0, val1;
    v16i8 store;
    v16u8 src = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;

    val0 = LD(src_top);
    val1 = LD(src_left);
    INSERT_D2_UB(val0, val1, src);
    sum_h = __msa_hadd_u_h(src, src);
    sum_w = __msa_hadd_u_w(sum_h, sum_h);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);
    store = __msa_splati_b((v16i8) sum_w, 0);
    val0 = __msa_copy_u_d((v2i64) store, 0);

    SD4(val0, val0, val0, val0, dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(val0, val0, val0, val0, dst, dst_stride);
}

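/* DC_TOP / DC_LEFT 8x8: rounded average of the 8 pixels of one edge. */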
#define INTRA_DC_TL_8x8(dir)                                    \
void ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
                           const uint8_t *left,                 \
                           const uint8_t *top)                  \
{                                                               \
    uint64_t val0;                                              \
    v16i8 store;                                                \
    v16u8 data = { 0 };                                         \
    v8u16 sum_h;                                                \
    v4u32 sum_w;                                                \
    v2u64 sum_d;                                                \
                                                                \
    val0 = LD(dir);                                             \
    data = (v16u8) __msa_insert_d((v2i64) data, 0, val0);       \
    sum_h = __msa_hadd_u_h(data, data);                         \
    sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \
    sum_d = __msa_hadd_u_d(sum_w, sum_w);                       \
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);            \
    store = __msa_splati_b((v16i8) sum_w, 0);                   \
    val0 = __msa_copy_u_d((v2i64) store, 0);                    \
                                                                \
    SD4(val0, val0, val0, val0, dst, dst_stride);               \
    dst += (4 * dst_stride);                                    \
    SD4(val0, val0, val0, val0, dst, dst_stride);               \
}

INTRA_DC_TL_8x8(top);
INTRA_DC_TL_8x8(left);

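/* DC prediction for a 16x16 block: rounded average of the 32 edge pixels
 * (16 top + 16 left). */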
void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
                     const uint8_t *src_left, const uint8_t *src_top)
{
    v16u8 top, left, out;
    v8u16 sum_h, sum_top, sum_left;
    v4u32 sum_w;
    v2u64 sum_d;

    top = LD_UB(src_top);
    left = LD_UB(src_left);
    HADD_UB2_UH(top, left, sum_top, sum_left);
    sum_h = sum_top + sum_left;
    sum_w = __msa_hadd_u_w(sum_h, sum_h);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);
    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);

    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
    dst += (8 * dst_stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}

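/* DC_TOP / DC_LEFT 16x16: rounded average of the 16 pixels of one edge. */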
#define INTRA_DC_TL_16x16(dir)                                        \
void ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,      \
                             const uint8_t *left,                     \
                             const uint8_t *top)                      \
{                                                                     \
    v16u8 data, out;                                                  \
    v8u16 sum_h;                                                      \
    v4u32 sum_w;                                                      \
    v2u64 sum_d;                                                      \
                                                                      \
    data = LD_UB(dir);                                                \
    sum_h = __msa_hadd_u_h(data, data);                               \
    sum_w = __msa_hadd_u_w(sum_h, sum_h);                             \
    sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \
    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);      \
    sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);                  \
    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);                   \
                                                                      \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
    dst += (8 * dst_stride);                                          \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
}
INTRA_DC_TL_16x16(top);
INTRA_DC_TL_16x16(left);

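/* DC prediction for a 32x32 block: rounded average of the 64 edge pixels
 * (32 top + 32 left). */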
void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
                     const uint8_t *src_left, const uint8_t *src_top)
{
    uint32_t row;
    v16u8 top0, top1, left0, left1, out;
    v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
    v4u32 sum_w;
    v2u64 sum_d;

    LD_UB2(src_top, 16, top0, top1);
    LD_UB2(src_left, 16, left0, left1);
    HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
    HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
    sum_h = sum_top0 + sum_top1;
    sum_h += sum_left0 + sum_left1;
    sum_w = __msa_hadd_u_w(sum_h, sum_h);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 6);
    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);

    for (row = 16; row--;) {
        ST_UB2(out, out, dst, 16);
        dst += dst_stride;
        ST_UB2(out, out, dst, 16);
        dst += dst_stride;
    }
}

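/* DC_TOP / DC_LEFT 32x32: rounded average of the 32 pixels of one edge. */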
#define INTRA_DC_TL_32x32(dir)                                    \
void ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
                             const uint8_t *left,                 \
                             const uint8_t *top)                  \
{                                                                 \
    uint32_t row;                                                 \
    v16u8 data0, data1, out;                                      \
    v8u16 sum_h, sum_data0, sum_data1;                            \
    v4u32 sum_w;                                                  \
    v2u64 sum_d;                                                  \
                                                                  \
    LD_UB2(dir, 16, data0, data1);                                \
    HADD_UB2_UH(data0, data1, sum_data0, sum_data1);              \
    sum_h = sum_data0 + sum_data1;                                \
    sum_w = __msa_hadd_u_w(sum_h, sum_h);                         \
    sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \
    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);  \
    sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \
    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);              \
    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);               \
                                                                  \
    for (row = 16; row--;) {                                      \
        ST_UB2(out, out, dst, 16);                                \
        dst += dst_stride;                                        \
        ST_UB2(out, out, dst, 16);                                \
        dst += dst_stride;                                        \
    }                                                             \
}
INTRA_DC_TL_32x32(top);
INTRA_DC_TL_32x32(left);

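/* Fixed-value DC predictors: fill the 16x16 block with the constant
 * 127, 128 or 129, for the cases where the corresponding edges are not
 * available. */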
#define INTRA_PREDICT_VALDC_16X16_MSA(val)                             \
void ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,       \
                             const uint8_t *left, const uint8_t *top)  \
{                                                                      \
    v16u8 out = (v16u8) __msa_ldi_b(val);                              \
                                                                       \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \
    dst += (8 * dst_stride);                                           \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \
}

INTRA_PREDICT_VALDC_16X16_MSA(127);
INTRA_PREDICT_VALDC_16X16_MSA(128);
INTRA_PREDICT_VALDC_16X16_MSA(129);

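/* Fixed-value DC predictors for 32x32 blocks. */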
#define INTRA_PREDICT_VALDC_32X32_MSA(val)                             \
void ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,       \
                             const uint8_t *left, const uint8_t *top)  \
{                                                                      \
    uint32_t row;                                                      \
    v16u8 out = (v16u8) __msa_ldi_b(val);                              \
                                                                       \
    for (row = 16; row--;) {                                           \
        ST_UB2(out, out, dst, 16);                                     \
        dst += dst_stride;                                             \
        ST_UB2(out, out, dst, 16);                                     \
        dst += dst_stride;                                             \
    }                                                                  \
}

INTRA_PREDICT_VALDC_32X32_MSA(127);
INTRA_PREDICT_VALDC_32X32_MSA(128);
INTRA_PREDICT_VALDC_32X32_MSA(129);

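/* TM (TrueMotion) prediction: each output pixel is
 * clip(left + top - top_left, 0, 255). The clip is implemented with a
 * widening add (left + top), an unsigned saturating subtract of the
 * top-left pixel, and an 8-bit saturate before packing back to bytes. */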
void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                   const uint8_t *src_left, const uint8_t *src_top_ptr)
{
    uint32_t left;
    uint8_t top_left = src_top_ptr[-1];
    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
    v16u8 src0, src1, src2, src3;
    v8u16 src_top_left, vec0, vec1, vec2, vec3;

    src_top_left = (v8u16) __msa_fill_h(top_left);
    src_top = LD_SB(src_top_ptr);
    left = LW(src_left);
    src_left0 = __msa_fill_b(left >> 24);
    src_left1 = __msa_fill_b(left >> 16);
    src_left2 = __msa_fill_b(left >> 8);
    src_left3 = __msa_fill_b(left);

    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
               src_left3, src_top, src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST_W2(tmp0, 0, 2, dst, dst_stride);
    ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
}

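/* TM prediction for an 8x8 block, producing four rows per loop iteration
 * from the reversed left-edge array. */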
void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
                   const uint8_t *src_left, const uint8_t *src_top_ptr)
{
    uint8_t top_left = src_top_ptr[-1];
    uint32_t loop_cnt, left;
    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
    v8u16 src_top_left, vec0, vec1, vec2, vec3;
    v16u8 src0, src1, src2, src3;

    src_top = LD_SB(src_top_ptr);
    src_top_left = (v8u16) __msa_fill_h(top_left);

    src_left += 4;
    for (loop_cnt = 2; loop_cnt--;) {
        left = LW(src_left);
        src_left0 = __msa_fill_b(left >> 24);
        src_left1 = __msa_fill_b(left >> 16);
        src_left2 = __msa_fill_b(left >> 8);
        src_left3 = __msa_fill_b(left);
        src_left -= 4;

        ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
                   src_left3, src_top, src0, src1, src2, src3);
        HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
        SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
        PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

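/* TM prediction for a 16x16 block: one 16-byte row per
 * interleave/add/subtract/saturate sequence, four rows per loop
 * iteration. */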
void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
                     const uint8_t *src_left, const uint8_t *src_top_ptr)
{
    uint8_t top_left = src_top_ptr[-1];
    uint32_t loop_cnt, left;
    v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
    v8u16 src_top_left, res_r, res_l;

    src_top = LD_SB(src_top_ptr);
    src_top_left = (v8u16) __msa_fill_h(top_left);

    src_left += 12;
    for (loop_cnt = 4; loop_cnt--;) {
        left = LW(src_left);
        src_left0 = __msa_fill_b(left >> 24);
        src_left1 = __msa_fill_b(left >> 16);
        src_left2 = __msa_fill_b(left >> 8);
        src_left3 = __msa_fill_b(left);
        src_left -= 4;

        ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
        HADD_UB2_UH(res_r, res_l, res_r, res_l);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
        SAT_UH2_UH(res_r, res_l, 7);
        PCKEV_ST_SB(res_r, res_l, dst);
        dst += dst_stride;

        ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
        HADD_UB2_UH(res_r, res_l, res_r, res_l);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
        SAT_UH2_UH(res_r, res_l, 7);
        PCKEV_ST_SB(res_r, res_l, dst);
        dst += dst_stride;

        ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
        HADD_UB2_UH(res_r, res_l, res_r, res_l);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
        SAT_UH2_UH(res_r, res_l, 7);
        PCKEV_ST_SB(res_r, res_l, dst);
        dst += dst_stride;

        ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
        HADD_UB2_UH(res_r, res_l, res_r, res_l);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
        SAT_UH2_UH(res_r, res_l, 7);
        PCKEV_ST_SB(res_r, res_l, dst);
        dst += dst_stride;
    }
}

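/* TM prediction for a 32x32 block: each row combines the left pixel with
 * both 16-byte top vectors, so every row needs two pack-and-store
 * operations. */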
void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
                     const uint8_t *src_left, const uint8_t *src_top_ptr)
{
    uint8_t top_left = src_top_ptr[-1];
    uint32_t loop_cnt, left;
    v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
    v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;

    src_top0 = LD_SB(src_top_ptr);
    src_top1 = LD_SB(src_top_ptr + 16);
    src_top_left = (v8u16) __msa_fill_h(top_left);

    src_left += 28;
    for (loop_cnt = 8; loop_cnt--;) {
        left = LW(src_left);
        src_left0 = __msa_fill_b(left >> 24);
        src_left1 = __msa_fill_b(left >> 16);
        src_left2 = __msa_fill_b(left >> 8);
        src_left3 = __msa_fill_b(left);
        src_left -= 4;

        ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
        ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                    res_l1);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
        PCKEV_ST_SB(res_r0, res_l0, dst);
        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
        dst += dst_stride;

        ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
        ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                    res_l1);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
        PCKEV_ST_SB(res_r0, res_l0, dst);
        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
        dst += dst_stride;

        ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
        ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                    res_l1);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
        PCKEV_ST_SB(res_r0, res_l0, dst);
        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
        dst += dst_stride;

        ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
        ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                    res_l1);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
        PCKEV_ST_SB(res_r0, res_l0, dst);
        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
        dst += dst_stride;
    }
}