1/*
2 * HEVC video decoder
3 *
4 * Copyright (C) 2012 - 2013 Guillaume Martres
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include "get_bits.h"
24#include "hevcdec.h"
25
26#include "bit_depth_template.c"
27#include "hevcdsp.h"
28
29static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
30                          GetBitContext *gb, int pcm_bit_depth)
31{
32    int x, y;
33    pixel *dst = (pixel *)_dst;
34
35    stride /= sizeof(pixel);
36
37    for (y = 0; y < height; y++) {
38        for (x = 0; x < width; x++)
39            dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
40        dst += stride;
41    }
42}
43
44static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
45                                                ptrdiff_t stride, int size)
46{
47    int x, y;
48    pixel *dst = (pixel *)_dst;
49
50    stride /= sizeof(pixel);
51
52    for (y = 0; y < size; y++) {
53        for (x = 0; x < size; x++) {
54            dst[x] = av_clip_pixel(dst[x] + *res);
55            res++;
56        }
57        dst += stride;
58    }
59}
60
61static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
62                                  ptrdiff_t stride)
63{
64    FUNC(add_residual)(_dst, res, stride, 4);
65}
66
67static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
68                                  ptrdiff_t stride)
69{
70    FUNC(add_residual)(_dst, res, stride, 8);
71}
72
73static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
74                                    ptrdiff_t stride)
75{
76    FUNC(add_residual)(_dst, res, stride, 16);
77}
78
79static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
80                                    ptrdiff_t stride)
81{
82    FUNC(add_residual)(_dst, res, stride, 32);
83}
84
85static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
86{
87    int16_t *coeffs = (int16_t *) _coeffs;
88    int x, y;
89    int size = 1 << log2_size;
90
91    if (mode) {
92        coeffs += size;
93        for (y = 0; y < size - 1; y++) {
94            for (x = 0; x < size; x++)
95                coeffs[x] += coeffs[x - size];
96            coeffs += size;
97        }
98    } else {
99        for (y = 0; y < size; y++) {
100            for (x = 1; x < size; x++)
101                coeffs[x] += coeffs[x - 1];
102            coeffs += size;
103        }
104    }
105}
106
107static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
108{
109    int shift  = 15 - BIT_DEPTH - log2_size;
110    int x, y;
111    int size = 1 << log2_size;
112
113    if (shift > 0) {
114        int offset = 1 << (shift - 1);
115        for (y = 0; y < size; y++) {
116            for (x = 0; x < size; x++) {
117                *coeffs = (*coeffs + offset) >> shift;
118                coeffs++;
119            }
120        }
121    } else {
122        for (y = 0; y < size; y++) {
123            for (x = 0; x < size; x++) {
124                *coeffs = *(uint16_t*)coeffs << -shift;
125                coeffs++;
126            }
127        }
128    }
129}
130
131#define SET(dst, x)   (dst) = (x)
132#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
133
134#define TR_4x4_LUMA(dst, src, step, assign)                             \
135    do {                                                                \
136        int c0 = src[0 * step] + src[2 * step];                         \
137        int c1 = src[2 * step] + src[3 * step];                         \
138        int c2 = src[0 * step] - src[3 * step];                         \
139        int c3 = 74 * src[1 * step];                                    \
140                                                                        \
141        assign(dst[2 * step], 74 * (src[0 * step] -                     \
142                                    src[2 * step] +                     \
143                                    src[3 * step]));                    \
144        assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
145        assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
146        assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
147    } while (0)
148
149static void FUNC(transform_4x4_luma)(int16_t *coeffs)
150{
151    int i;
152    int shift    = 7;
153    int add      = 1 << (shift - 1);
154    int16_t *src = coeffs;
155
156    for (i = 0; i < 4; i++) {
157        TR_4x4_LUMA(src, src, 4, SCALE);
158        src++;
159    }
160
161    shift = 20 - BIT_DEPTH;
162    add   = 1 << (shift - 1);
163    for (i = 0; i < 4; i++) {
164        TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
165        coeffs += 4;
166    }
167}
168
169#undef TR_4x4_LUMA
170
171#define TR_4(dst, src, dstep, sstep, assign, end)                 \
172    do {                                                          \
173        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
174        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
175        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
176        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
177                                                                  \
178        assign(dst[0 * dstep], e0 + o0);                          \
179        assign(dst[1 * dstep], e1 + o1);                          \
180        assign(dst[2 * dstep], e1 - o1);                          \
181        assign(dst[3 * dstep], e0 - o0);                          \
182    } while (0)
183
184#define TR_8(dst, src, dstep, sstep, assign, end)                 \
185    do {                                                          \
186        int i, j;                                                 \
187        int e_8[4];                                               \
188        int o_8[4] = { 0 };                                       \
189        for (i = 0; i < 4; i++)                                   \
190            for (j = 1; j < end; j += 2)                          \
191                o_8[i] += transform[4 * j][i] * src[j * sstep];   \
192        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                     \
193                                                                  \
194        for (i = 0; i < 4; i++) {                                 \
195            assign(dst[i * dstep], e_8[i] + o_8[i]);              \
196            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
197        }                                                         \
198    } while (0)
199
200#define TR_16(dst, src, dstep, sstep, assign, end)                \
201    do {                                                          \
202        int i, j;                                                 \
203        int e_16[8];                                              \
204        int o_16[8] = { 0 };                                      \
205        for (i = 0; i < 8; i++)                                   \
206            for (j = 1; j < end; j += 2)                          \
207                o_16[i] += transform[2 * j][i] * src[j * sstep];  \
208        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                    \
209                                                                  \
210        for (i = 0; i < 8; i++) {                                 \
211            assign(dst[i * dstep], e_16[i] + o_16[i]);            \
212            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
213        }                                                         \
214    } while (0)
215
216#define TR_32(dst, src, dstep, sstep, assign, end)                \
217    do {                                                          \
218        int i, j;                                                 \
219        int e_32[16];                                             \
220        int o_32[16] = { 0 };                                     \
221        for (i = 0; i < 16; i++)                                  \
222            for (j = 1; j < end; j += 2)                          \
223                o_32[i] += transform[j][i] * src[j * sstep];      \
224        TR_16(e_32, src, 1, 2 * sstep, SET, end / 2);             \
225                                                                  \
226        for (i = 0; i < 16; i++) {                                \
227            assign(dst[i * dstep], e_32[i] + o_32[i]);            \
228            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
229        }                                                         \
230    } while (0)
231
232#define IDCT_VAR4(H)                                              \
233    int limit2 = FFMIN(col_limit + 4, H)
234#define IDCT_VAR8(H)                                              \
235    int limit  = FFMIN(col_limit, H);                             \
236    int limit2 = FFMIN(col_limit + 4, H)
237#define IDCT_VAR16(H)   IDCT_VAR8(H)
238#define IDCT_VAR32(H)   IDCT_VAR8(H)
239
240#define IDCT(H)                                                   \
241static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs,          \
242                                        int col_limit)            \
243{                                                                 \
244    int i;                                                        \
245    int      shift = 7;                                           \
246    int      add   = 1 << (shift - 1);                            \
247    int16_t *src   = coeffs;                                      \
248    IDCT_VAR ## H(H);                                             \
249                                                                  \
250    for (i = 0; i < H; i++) {                                     \
251        TR_ ## H(src, src, H, H, SCALE, limit2);                  \
252        if (limit2 < H && i%4 == 0 && !!i)                        \
253            limit2 -= 4;                                          \
254        src++;                                                    \
255    }                                                             \
256                                                                  \
257    shift = 20 - BIT_DEPTH;                                       \
258    add   = 1 << (shift - 1);                                     \
259    for (i = 0; i < H; i++) {                                     \
260        TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);             \
261        coeffs += H;                                              \
262    }                                                             \
263}
264
265#define IDCT_DC(H)                                                \
266static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs)    \
267{                                                                 \
268    int i, j;                                                     \
269    int shift = 14 - BIT_DEPTH;                                   \
270    int add   = 1 << (shift - 1);                                 \
271    int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift;          \
272                                                                  \
273    for (j = 0; j < H; j++) {                                     \
274        for (i = 0; i < H; i++) {                                 \
275            coeffs[i + j * H] = coeff;                            \
276        }                                                         \
277    }                                                             \
278}
279
280IDCT( 4)
281IDCT( 8)
282IDCT(16)
283IDCT(32)
284
285IDCT_DC( 4)
286IDCT_DC( 8)
287IDCT_DC(16)
288IDCT_DC(32)
289
290#undef TR_4
291#undef TR_8
292#undef TR_16
293#undef TR_32
294
295#undef SET
296#undef SCALE
297
298static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
299                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
300                                  int16_t *sao_offset_val, int sao_left_class,
301                                  int width, int height)
302{
303    pixel *dst = (pixel *)_dst;
304    pixel *src = (pixel *)_src;
305    int offset_table[32] = { 0 };
306    int k, y, x;
307    int shift  = BIT_DEPTH - 5;
308
309    stride_dst /= sizeof(pixel);
310    stride_src /= sizeof(pixel);
311
312    for (k = 0; k < 4; k++)
313        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
314    for (y = 0; y < height; y++) {
315        for (x = 0; x < width; x++)
316            dst[x] = av_clip_pixel(src[x] + offset_table[(src[x] >> shift) & 31]);
317        dst += stride_dst;
318        src += stride_src;
319    }
320}
321
322#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
323
324static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
325                                  int eo, int width, int height) {
326
327    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
328    static const int8_t pos[4][2][2] = {
329        { { -1,  0 }, {  1, 0 } }, // horizontal
330        { {  0, -1 }, {  0, 1 } }, // vertical
331        { { -1, -1 }, {  1, 1 } }, // 45 degree
332        { {  1, -1 }, { -1, 1 } }, // 135 degree
333    };
334    pixel *dst = (pixel *)_dst;
335    pixel *src = (pixel *)_src;
336    int a_stride, b_stride;
337    int x, y;
338    ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
339    stride_dst /= sizeof(pixel);
340
341    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
342    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
343    for (y = 0; y < height; y++) {
344        for (x = 0; x < width; x++) {
345            int diff0 = CMP(src[x], src[x + a_stride]);
346            int diff1 = CMP(src[x], src[x + b_stride]);
347            int offset_val        = edge_idx[2 + diff0 + diff1];
348            dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
349        }
350        src += stride_src;
351        dst += stride_dst;
352    }
353}
354
355static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
356                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
357                                    int *borders, int _width, int _height,
358                                    int c_idx, uint8_t *vert_edge,
359                                    uint8_t *horiz_edge, uint8_t *diag_edge)
360{
361    int x, y;
362    pixel *dst = (pixel *)_dst;
363    pixel *src = (pixel *)_src;
364    int16_t *sao_offset_val = sao->offset_val[c_idx];
365    int sao_eo_class    = sao->eo_class[c_idx];
366    int init_x = 0, width = _width, height = _height;
367
368    stride_dst /= sizeof(pixel);
369    stride_src /= sizeof(pixel);
370
371    if (sao_eo_class != SAO_EO_VERT) {
372        if (borders[0]) {
373            int offset_val = sao_offset_val[0];
374            for (y = 0; y < height; y++) {
375                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
376            }
377            init_x = 1;
378        }
379        if (borders[2]) {
380            int offset_val = sao_offset_val[0];
381            int offset     = width - 1;
382            for (x = 0; x < height; x++) {
383                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
384            }
385            width--;
386        }
387    }
388    if (sao_eo_class != SAO_EO_HORIZ) {
389        if (borders[1]) {
390            int offset_val = sao_offset_val[0];
391            for (x = init_x; x < width; x++)
392                dst[x] = av_clip_pixel(src[x] + offset_val);
393        }
394        if (borders[3]) {
395            int offset_val   = sao_offset_val[0];
396            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
397            ptrdiff_t y_stride_src = stride_src * (height - 1);
398            for (x = init_x; x < width; x++)
399                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
400            height--;
401        }
402    }
403}
404
405static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
406                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
407                                    int *borders, int _width, int _height,
408                                    int c_idx, uint8_t *vert_edge,
409                                    uint8_t *horiz_edge, uint8_t *diag_edge)
410{
411    int x, y;
412    pixel *dst = (pixel *)_dst;
413    pixel *src = (pixel *)_src;
414    int16_t *sao_offset_val = sao->offset_val[c_idx];
415    int sao_eo_class    = sao->eo_class[c_idx];
416    int init_x = 0, init_y = 0, width = _width, height = _height;
417
418    stride_dst /= sizeof(pixel);
419    stride_src /= sizeof(pixel);
420
421    if (sao_eo_class != SAO_EO_VERT) {
422        if (borders[0]) {
423            int offset_val = sao_offset_val[0];
424            for (y = 0; y < height; y++) {
425                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
426            }
427            init_x = 1;
428        }
429        if (borders[2]) {
430            int offset_val = sao_offset_val[0];
431            int offset     = width - 1;
432            for (x = 0; x < height; x++) {
433                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
434            }
435            width--;
436        }
437    }
438    if (sao_eo_class != SAO_EO_HORIZ) {
439        if (borders[1]) {
440            int offset_val = sao_offset_val[0];
441            for (x = init_x; x < width; x++)
442                dst[x] = av_clip_pixel(src[x] + offset_val);
443            init_y = 1;
444        }
445        if (borders[3]) {
446            int offset_val   = sao_offset_val[0];
447            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
448            ptrdiff_t y_stride_src = stride_src * (height - 1);
449            for (x = init_x; x < width; x++)
450                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
451            height--;
452        }
453    }
454
455    {
456        int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
457        int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
458        int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
459        int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
460
461        // Restore pixels that can't be modified
462        if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
463            for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
464                dst[y*stride_dst] = src[y*stride_src];
465        }
466        if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
467            for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
468                dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
469        }
470
471        if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
472            for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
473                dst[x] = src[x];
474        }
475        if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
476            for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
477                dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
478        }
479        if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
480            dst[0] = src[0];
481        if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
482            dst[width-1] = src[width-1];
483        if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
484            dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
485        if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
486            dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
487
488    }
489}
490
491#undef CMP
492
493////////////////////////////////////////////////////////////////////////////////
494//
495////////////////////////////////////////////////////////////////////////////////
496static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
497                                      uint8_t *_src, ptrdiff_t _srcstride,
498                                      int height, intptr_t mx, intptr_t my, int width)
499{
500    int x, y;
501    pixel *src          = (pixel *)_src;
502    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
503
504    for (y = 0; y < height; y++) {
505        for (x = 0; x < width; x++)
506            dst[x] = src[x] << (14 - BIT_DEPTH);
507        src += srcstride;
508        dst += MAX_PB_SIZE;
509    }
510}
511
512static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
513                                          int height, intptr_t mx, intptr_t my, int width)
514{
515    int y;
516    pixel *src          = (pixel *)_src;
517    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
518    pixel *dst          = (pixel *)_dst;
519    ptrdiff_t dststride = _dststride / sizeof(pixel);
520
521    for (y = 0; y < height; y++) {
522        memcpy(dst, src, width * sizeof(pixel));
523        src += srcstride;
524        dst += dststride;
525    }
526}
527
528static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
529                                         int16_t *src2,
530                                         int height, intptr_t mx, intptr_t my, int width)
531{
532    int x, y;
533    pixel *src          = (pixel *)_src;
534    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
535    pixel *dst          = (pixel *)_dst;
536    ptrdiff_t dststride = _dststride / sizeof(pixel);
537
538    int shift = 14  + 1 - BIT_DEPTH;
539#if BIT_DEPTH < 14
540    int offset = 1 << (shift - 1);
541#else
542    int offset = 0;
543#endif
544
545    for (y = 0; y < height; y++) {
546        for (x = 0; x < width; x++)
547            dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
548        src  += srcstride;
549        dst  += dststride;
550        src2 += MAX_PB_SIZE;
551    }
552}
553
554static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
555                                            int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
556{
557    int x, y;
558    pixel *src          = (pixel *)_src;
559    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
560    pixel *dst          = (pixel *)_dst;
561    ptrdiff_t dststride = _dststride / sizeof(pixel);
562    int shift = denom + 14 - BIT_DEPTH;
563#if BIT_DEPTH < 14
564    int offset = 1 << (shift - 1);
565#else
566    int offset = 0;
567#endif
568
569    ox     = ox * (1 << (BIT_DEPTH - 8));
570    for (y = 0; y < height; y++) {
571        for (x = 0; x < width; x++)
572            dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
573        src += srcstride;
574        dst += dststride;
575    }
576}
577
578static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
579                                           int16_t *src2,
580                                           int height, int denom, int wx0, int wx1,
581                                           int ox0, int ox1, intptr_t mx, intptr_t my, int width)
582{
583    int x, y;
584    pixel *src          = (pixel *)_src;
585    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
586    pixel *dst          = (pixel *)_dst;
587    ptrdiff_t dststride = _dststride / sizeof(pixel);
588
589    int shift = 14  + 1 - BIT_DEPTH;
590    int log2Wd = denom + shift - 1;
591
592    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
593    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
594    for (y = 0; y < height; y++) {
595        for (x = 0; x < width; x++) {
596            dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
597        }
598        src  += srcstride;
599        dst  += dststride;
600        src2 += MAX_PB_SIZE;
601    }
602}
603
604////////////////////////////////////////////////////////////////////////////////
605//
606////////////////////////////////////////////////////////////////////////////////
607#define QPEL_FILTER(src, stride)                                               \
608    (filter[0] * src[x - 3 * stride] +                                         \
609     filter[1] * src[x - 2 * stride] +                                         \
610     filter[2] * src[x -     stride] +                                         \
611     filter[3] * src[x             ] +                                         \
612     filter[4] * src[x +     stride] +                                         \
613     filter[5] * src[x + 2 * stride] +                                         \
614     filter[6] * src[x + 3 * stride] +                                         \
615     filter[7] * src[x + 4 * stride])
616
617static void FUNC(put_hevc_qpel_h)(int16_t *dst,
618                                  uint8_t *_src, ptrdiff_t _srcstride,
619                                  int height, intptr_t mx, intptr_t my, int width)
620{
621    int x, y;
622    pixel        *src       = (pixel*)_src;
623    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
624    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
625    for (y = 0; y < height; y++) {
626        for (x = 0; x < width; x++)
627            dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
628        src += srcstride;
629        dst += MAX_PB_SIZE;
630    }
631}
632
633static void FUNC(put_hevc_qpel_v)(int16_t *dst,
634                                  uint8_t *_src, ptrdiff_t _srcstride,
635                                  int height, intptr_t mx, intptr_t my, int width)
636{
637    int x, y;
638    pixel        *src       = (pixel*)_src;
639    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
640    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
641    for (y = 0; y < height; y++)  {
642        for (x = 0; x < width; x++)
643            dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
644        src += srcstride;
645        dst += MAX_PB_SIZE;
646    }
647}
648
649static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
650                                   uint8_t *_src,
651                                   ptrdiff_t _srcstride,
652                                   int height, intptr_t mx,
653                                   intptr_t my, int width)
654{
655    int x, y;
656    const int8_t *filter;
657    pixel *src = (pixel*)_src;
658    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
659    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
660    int16_t *tmp = tmp_array;
661
662    src   -= QPEL_EXTRA_BEFORE * srcstride;
663    filter = ff_hevc_qpel_filters[mx - 1];
664    for (y = 0; y < height + QPEL_EXTRA; y++) {
665        for (x = 0; x < width; x++)
666            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
667        src += srcstride;
668        tmp += MAX_PB_SIZE;
669    }
670
671    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
672    filter = ff_hevc_qpel_filters[my - 1];
673    for (y = 0; y < height; y++) {
674        for (x = 0; x < width; x++)
675            dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
676        tmp += MAX_PB_SIZE;
677        dst += MAX_PB_SIZE;
678    }
679}
680
681static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
682                                      uint8_t *_src, ptrdiff_t _srcstride,
683                                      int height, intptr_t mx, intptr_t my, int width)
684{
685    int x, y;
686    pixel        *src       = (pixel*)_src;
687    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
688    pixel *dst          = (pixel *)_dst;
689    ptrdiff_t dststride = _dststride / sizeof(pixel);
690    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
691    int shift = 14 - BIT_DEPTH;
692
693#if BIT_DEPTH < 14
694    int offset = 1 << (shift - 1);
695#else
696    int offset = 0;
697#endif
698
699    for (y = 0; y < height; y++) {
700        for (x = 0; x < width; x++)
701            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
702        src += srcstride;
703        dst += dststride;
704    }
705}
706
707static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
708                                     int16_t *src2,
709                                     int height, intptr_t mx, intptr_t my, int width)
710{
711    int x, y;
712    pixel        *src       = (pixel*)_src;
713    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
714    pixel *dst          = (pixel *)_dst;
715    ptrdiff_t dststride = _dststride / sizeof(pixel);
716
717    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
718
719    int shift = 14  + 1 - BIT_DEPTH;
720#if BIT_DEPTH < 14
721    int offset = 1 << (shift - 1);
722#else
723    int offset = 0;
724#endif
725
726    for (y = 0; y < height; y++) {
727        for (x = 0; x < width; x++)
728            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
729        src  += srcstride;
730        dst  += dststride;
731        src2 += MAX_PB_SIZE;
732    }
733}
734
735static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
736                                     uint8_t *_src, ptrdiff_t _srcstride,
737                                     int height, intptr_t mx, intptr_t my, int width)
738{
739    int x, y;
740    pixel        *src       = (pixel*)_src;
741    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
742    pixel *dst          = (pixel *)_dst;
743    ptrdiff_t dststride = _dststride / sizeof(pixel);
744    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
745    int shift = 14 - BIT_DEPTH;
746
747#if BIT_DEPTH < 14
748    int offset = 1 << (shift - 1);
749#else
750    int offset = 0;
751#endif
752
753    for (y = 0; y < height; y++) {
754        for (x = 0; x < width; x++)
755            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
756        src += srcstride;
757        dst += dststride;
758    }
759}
760
761
762static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
763                                     int16_t *src2,
764                                     int height, intptr_t mx, intptr_t my, int width)
765{
766    int x, y;
767    pixel        *src       = (pixel*)_src;
768    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
769    pixel *dst          = (pixel *)_dst;
770    ptrdiff_t dststride = _dststride / sizeof(pixel);
771
772    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
773
774    int shift = 14 + 1 - BIT_DEPTH;
775#if BIT_DEPTH < 14
776    int offset = 1 << (shift - 1);
777#else
778    int offset = 0;
779#endif
780
781    for (y = 0; y < height; y++) {
782        for (x = 0; x < width; x++)
783            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
784        src  += srcstride;
785        dst  += dststride;
786        src2 += MAX_PB_SIZE;
787    }
788}
789
790static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
791                                       uint8_t *_src, ptrdiff_t _srcstride,
792                                       int height, intptr_t mx, intptr_t my, int width)
793{
794    int x, y;
795    const int8_t *filter;
796    pixel *src = (pixel*)_src;
797    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
798    pixel *dst          = (pixel *)_dst;
799    ptrdiff_t dststride = _dststride / sizeof(pixel);
800    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
801    int16_t *tmp = tmp_array;
802    int shift =  14 - BIT_DEPTH;
803
804#if BIT_DEPTH < 14
805    int offset = 1 << (shift - 1);
806#else
807    int offset = 0;
808#endif
809
810    src   -= QPEL_EXTRA_BEFORE * srcstride;
811    filter = ff_hevc_qpel_filters[mx - 1];
812    for (y = 0; y < height + QPEL_EXTRA; y++) {
813        for (x = 0; x < width; x++)
814            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
815        src += srcstride;
816        tmp += MAX_PB_SIZE;
817    }
818
819    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
820    filter = ff_hevc_qpel_filters[my - 1];
821
822    for (y = 0; y < height; y++) {
823        for (x = 0; x < width; x++)
824            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
825        tmp += MAX_PB_SIZE;
826        dst += dststride;
827    }
828}
829
830static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
831                                      int16_t *src2,
832                                      int height, intptr_t mx, intptr_t my, int width)
833{
834    int x, y;
835    const int8_t *filter;
836    pixel *src = (pixel*)_src;
837    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
838    pixel *dst          = (pixel *)_dst;
839    ptrdiff_t dststride = _dststride / sizeof(pixel);
840    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
841    int16_t *tmp = tmp_array;
842    int shift = 14 + 1 - BIT_DEPTH;
843#if BIT_DEPTH < 14
844    int offset = 1 << (shift - 1);
845#else
846    int offset = 0;
847#endif
848
849    src   -= QPEL_EXTRA_BEFORE * srcstride;
850    filter = ff_hevc_qpel_filters[mx - 1];
851    for (y = 0; y < height + QPEL_EXTRA; y++) {
852        for (x = 0; x < width; x++)
853            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
854        src += srcstride;
855        tmp += MAX_PB_SIZE;
856    }
857
858    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
859    filter = ff_hevc_qpel_filters[my - 1];
860
861    for (y = 0; y < height; y++) {
862        for (x = 0; x < width; x++)
863            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
864        tmp  += MAX_PB_SIZE;
865        dst  += dststride;
866        src2 += MAX_PB_SIZE;
867    }
868}
869
870static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
871                                        uint8_t *_src, ptrdiff_t _srcstride,
872                                        int height, int denom, int wx, int ox,
873                                        intptr_t mx, intptr_t my, int width)
874{
875    int x, y;
876    pixel        *src       = (pixel*)_src;
877    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
878    pixel *dst          = (pixel *)_dst;
879    ptrdiff_t dststride = _dststride / sizeof(pixel);
880    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
881    int shift = denom + 14 - BIT_DEPTH;
882#if BIT_DEPTH < 14
883    int offset = 1 << (shift - 1);
884#else
885    int offset = 0;
886#endif
887
888    ox = ox * (1 << (BIT_DEPTH - 8));
889    for (y = 0; y < height; y++) {
890        for (x = 0; x < width; x++)
891            dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
892        src += srcstride;
893        dst += dststride;
894    }
895}
896
897static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
898                                       int16_t *src2,
899                                       int height, int denom, int wx0, int wx1,
900                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
901{
902    int x, y;
903    pixel        *src       = (pixel*)_src;
904    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
905    pixel *dst          = (pixel *)_dst;
906    ptrdiff_t dststride = _dststride / sizeof(pixel);
907
908    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
909
910    int shift = 14  + 1 - BIT_DEPTH;
911    int log2Wd = denom + shift - 1;
912
913    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
914    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
915    for (y = 0; y < height; y++) {
916        for (x = 0; x < width; x++)
917            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
918                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
919        src  += srcstride;
920        dst  += dststride;
921        src2 += MAX_PB_SIZE;
922    }
923}
924
925static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
926                                        uint8_t *_src, ptrdiff_t _srcstride,
927                                        int height, int denom, int wx, int ox,
928                                        intptr_t mx, intptr_t my, int width)
929{
930    int x, y;
931    pixel        *src       = (pixel*)_src;
932    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
933    pixel *dst          = (pixel *)_dst;
934    ptrdiff_t dststride = _dststride / sizeof(pixel);
935    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
936    int shift = denom + 14 - BIT_DEPTH;
937#if BIT_DEPTH < 14
938    int offset = 1 << (shift - 1);
939#else
940    int offset = 0;
941#endif
942
943    ox = ox * (1 << (BIT_DEPTH - 8));
944    for (y = 0; y < height; y++) {
945        for (x = 0; x < width; x++)
946            dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
947        src += srcstride;
948        dst += dststride;
949    }
950}
951
952static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
953                                       int16_t *src2,
954                                       int height, int denom, int wx0, int wx1,
955                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
956{
957    int x, y;
958    pixel        *src       = (pixel*)_src;
959    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
960    pixel *dst          = (pixel *)_dst;
961    ptrdiff_t dststride = _dststride / sizeof(pixel);
962
963    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
964
965    int shift = 14 + 1 - BIT_DEPTH;
966    int log2Wd = denom + shift - 1;
967
968    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
969    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
970    for (y = 0; y < height; y++) {
971        for (x = 0; x < width; x++)
972            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
973                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
974        src  += srcstride;
975        dst  += dststride;
976        src2 += MAX_PB_SIZE;
977    }
978}
979
980static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
981                                         uint8_t *_src, ptrdiff_t _srcstride,
982                                         int height, int denom, int wx, int ox,
983                                         intptr_t mx, intptr_t my, int width)
984{
985    int x, y;
986    const int8_t *filter;
987    pixel *src = (pixel*)_src;
988    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
989    pixel *dst          = (pixel *)_dst;
990    ptrdiff_t dststride = _dststride / sizeof(pixel);
991    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
992    int16_t *tmp = tmp_array;
993    int shift = denom + 14 - BIT_DEPTH;
994#if BIT_DEPTH < 14
995    int offset = 1 << (shift - 1);
996#else
997    int offset = 0;
998#endif
999
1000    src   -= QPEL_EXTRA_BEFORE * srcstride;
1001    filter = ff_hevc_qpel_filters[mx - 1];
1002    for (y = 0; y < height + QPEL_EXTRA; y++) {
1003        for (x = 0; x < width; x++)
1004            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1005        src += srcstride;
1006        tmp += MAX_PB_SIZE;
1007    }
1008
1009    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1010    filter = ff_hevc_qpel_filters[my - 1];
1011
1012    ox = ox * (1 << (BIT_DEPTH - 8));
1013    for (y = 0; y < height; y++) {
1014        for (x = 0; x < width; x++)
1015            dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1016        tmp += MAX_PB_SIZE;
1017        dst += dststride;
1018    }
1019}
1020
1021static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1022                                        int16_t *src2,
1023                                        int height, int denom, int wx0, int wx1,
1024                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1025{
1026    int x, y;
1027    const int8_t *filter;
1028    pixel *src = (pixel*)_src;
1029    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1030    pixel *dst          = (pixel *)_dst;
1031    ptrdiff_t dststride = _dststride / sizeof(pixel);
1032    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
1033    int16_t *tmp = tmp_array;
1034    int shift = 14 + 1 - BIT_DEPTH;
1035    int log2Wd = denom + shift - 1;
1036
1037    src   -= QPEL_EXTRA_BEFORE * srcstride;
1038    filter = ff_hevc_qpel_filters[mx - 1];
1039    for (y = 0; y < height + QPEL_EXTRA; y++) {
1040        for (x = 0; x < width; x++)
1041            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1042        src += srcstride;
1043        tmp += MAX_PB_SIZE;
1044    }
1045
1046    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1047    filter = ff_hevc_qpel_filters[my - 1];
1048
1049    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1050    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1051    for (y = 0; y < height; y++) {
1052        for (x = 0; x < width; x++)
1053            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1054                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
1055        tmp  += MAX_PB_SIZE;
1056        dst  += dststride;
1057        src2 += MAX_PB_SIZE;
1058    }
1059}
1060
1061////////////////////////////////////////////////////////////////////////////////
1062//
1063////////////////////////////////////////////////////////////////////////////////
1064#define EPEL_FILTER(src, stride)                                               \
1065    (filter[0] * src[x - stride] +                                             \
1066     filter[1] * src[x]          +                                             \
1067     filter[2] * src[x + stride] +                                             \
1068     filter[3] * src[x + 2 * stride])
1069
1070static void FUNC(put_hevc_epel_h)(int16_t *dst,
1071                                  uint8_t *_src, ptrdiff_t _srcstride,
1072                                  int height, intptr_t mx, intptr_t my, int width)
1073{
1074    int x, y;
1075    pixel *src = (pixel *)_src;
1076    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1077    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1078    for (y = 0; y < height; y++) {
1079        for (x = 0; x < width; x++)
1080            dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1081        src += srcstride;
1082        dst += MAX_PB_SIZE;
1083    }
1084}
1085
1086static void FUNC(put_hevc_epel_v)(int16_t *dst,
1087                                  uint8_t *_src, ptrdiff_t _srcstride,
1088                                  int height, intptr_t mx, intptr_t my, int width)
1089{
1090    int x, y;
1091    pixel *src = (pixel *)_src;
1092    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1093    const int8_t *filter = ff_hevc_epel_filters[my - 1];
1094
1095    for (y = 0; y < height; y++) {
1096        for (x = 0; x < width; x++)
1097            dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
1098        src += srcstride;
1099        dst += MAX_PB_SIZE;
1100    }
1101}
1102
1103static void FUNC(put_hevc_epel_hv)(int16_t *dst,
1104                                   uint8_t *_src, ptrdiff_t _srcstride,
1105                                   int height, intptr_t mx, intptr_t my, int width)
1106{
1107    int x, y;
1108    pixel *src = (pixel *)_src;
1109    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1110    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1111    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1112    int16_t *tmp = tmp_array;
1113
1114    src -= EPEL_EXTRA_BEFORE * srcstride;
1115
1116    for (y = 0; y < height + EPEL_EXTRA; y++) {
1117        for (x = 0; x < width; x++)
1118            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1119        src += srcstride;
1120        tmp += MAX_PB_SIZE;
1121    }
1122
1123    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1124    filter = ff_hevc_epel_filters[my - 1];
1125
1126    for (y = 0; y < height; y++) {
1127        for (x = 0; x < width; x++)
1128            dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
1129        tmp += MAX_PB_SIZE;
1130        dst += MAX_PB_SIZE;
1131    }
1132}
1133
1134static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1135                                      int height, intptr_t mx, intptr_t my, int width)
1136{
1137    int x, y;
1138    pixel *src = (pixel *)_src;
1139    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1140    pixel *dst          = (pixel *)_dst;
1141    ptrdiff_t dststride = _dststride / sizeof(pixel);
1142    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1143    int shift = 14 - BIT_DEPTH;
1144#if BIT_DEPTH < 14
1145    int offset = 1 << (shift - 1);
1146#else
1147    int offset = 0;
1148#endif
1149
1150    for (y = 0; y < height; y++) {
1151        for (x = 0; x < width; x++)
1152            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
1153        src += srcstride;
1154        dst += dststride;
1155    }
1156}
1157
1158static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1159                                     int16_t *src2,
1160                                     int height, intptr_t mx, intptr_t my, int width)
1161{
1162    int x, y;
1163    pixel *src = (pixel *)_src;
1164    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1165    pixel *dst          = (pixel *)_dst;
1166    ptrdiff_t dststride = _dststride / sizeof(pixel);
1167    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1168    int shift = 14 + 1 - BIT_DEPTH;
1169#if BIT_DEPTH < 14
1170    int offset = 1 << (shift - 1);
1171#else
1172    int offset = 0;
1173#endif
1174
1175    for (y = 0; y < height; y++) {
1176        for (x = 0; x < width; x++) {
1177            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1178        }
1179        dst  += dststride;
1180        src  += srcstride;
1181        src2 += MAX_PB_SIZE;
1182    }
1183}
1184
1185static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1186                                      int height, intptr_t mx, intptr_t my, int width)
1187{
1188    int x, y;
1189    pixel *src = (pixel *)_src;
1190    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1191    pixel *dst          = (pixel *)_dst;
1192    ptrdiff_t dststride = _dststride / sizeof(pixel);
1193    const int8_t *filter = ff_hevc_epel_filters[my - 1];
1194    int shift = 14 - BIT_DEPTH;
1195#if BIT_DEPTH < 14
1196    int offset = 1 << (shift - 1);
1197#else
1198    int offset = 0;
1199#endif
1200
1201    for (y = 0; y < height; y++) {
1202        for (x = 0; x < width; x++)
1203            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
1204        src += srcstride;
1205        dst += dststride;
1206    }
1207}
1208
1209static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1210                                     int16_t *src2,
1211                                     int height, intptr_t mx, intptr_t my, int width)
1212{
1213    int x, y;
1214    pixel *src = (pixel *)_src;
1215    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1216    const int8_t *filter = ff_hevc_epel_filters[my - 1];
1217    pixel *dst          = (pixel *)_dst;
1218    ptrdiff_t dststride = _dststride / sizeof(pixel);
1219    int shift = 14 + 1 - BIT_DEPTH;
1220#if BIT_DEPTH < 14
1221    int offset = 1 << (shift - 1);
1222#else
1223    int offset = 0;
1224#endif
1225
1226    for (y = 0; y < height; y++) {
1227        for (x = 0; x < width; x++)
1228            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1229        dst  += dststride;
1230        src  += srcstride;
1231        src2 += MAX_PB_SIZE;
1232    }
1233}
1234
1235static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1236                                       int height, intptr_t mx, intptr_t my, int width)
1237{
1238    int x, y;
1239    pixel *src = (pixel *)_src;
1240    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1241    pixel *dst          = (pixel *)_dst;
1242    ptrdiff_t dststride = _dststride / sizeof(pixel);
1243    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1244    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1245    int16_t *tmp = tmp_array;
1246    int shift = 14 - BIT_DEPTH;
1247#if BIT_DEPTH < 14
1248    int offset = 1 << (shift - 1);
1249#else
1250    int offset = 0;
1251#endif
1252
1253    src -= EPEL_EXTRA_BEFORE * srcstride;
1254
1255    for (y = 0; y < height + EPEL_EXTRA; y++) {
1256        for (x = 0; x < width; x++)
1257            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1258        src += srcstride;
1259        tmp += MAX_PB_SIZE;
1260    }
1261
1262    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1263    filter = ff_hevc_epel_filters[my - 1];
1264
1265    for (y = 0; y < height; y++) {
1266        for (x = 0; x < width; x++)
1267            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
1268        tmp += MAX_PB_SIZE;
1269        dst += dststride;
1270    }
1271}
1272
1273static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1274                                      int16_t *src2,
1275                                      int height, intptr_t mx, intptr_t my, int width)
1276{
1277    int x, y;
1278    pixel *src = (pixel *)_src;
1279    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1280    pixel *dst          = (pixel *)_dst;
1281    ptrdiff_t dststride = _dststride / sizeof(pixel);
1282    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1283    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1284    int16_t *tmp = tmp_array;
1285    int shift = 14 + 1 - BIT_DEPTH;
1286#if BIT_DEPTH < 14
1287    int offset = 1 << (shift - 1);
1288#else
1289    int offset = 0;
1290#endif
1291
1292    src -= EPEL_EXTRA_BEFORE * srcstride;
1293
1294    for (y = 0; y < height + EPEL_EXTRA; y++) {
1295        for (x = 0; x < width; x++)
1296            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1297        src += srcstride;
1298        tmp += MAX_PB_SIZE;
1299    }
1300
1301    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1302    filter = ff_hevc_epel_filters[my - 1];
1303
1304    for (y = 0; y < height; y++) {
1305        for (x = 0; x < width; x++)
1306            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
1307        tmp  += MAX_PB_SIZE;
1308        dst  += dststride;
1309        src2 += MAX_PB_SIZE;
1310    }
1311}
1312
1313static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1314                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1315{
1316    int x, y;
1317    pixel *src = (pixel *)_src;
1318    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1319    pixel *dst          = (pixel *)_dst;
1320    ptrdiff_t dststride = _dststride / sizeof(pixel);
1321    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1322    int shift = denom + 14 - BIT_DEPTH;
1323#if BIT_DEPTH < 14
1324    int offset = 1 << (shift - 1);
1325#else
1326    int offset = 0;
1327#endif
1328
1329    ox     = ox * (1 << (BIT_DEPTH - 8));
1330    for (y = 0; y < height; y++) {
1331        for (x = 0; x < width; x++) {
1332            dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1333        }
1334        dst += dststride;
1335        src += srcstride;
1336    }
1337}
1338
1339static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1340                                       int16_t *src2,
1341                                       int height, int denom, int wx0, int wx1,
1342                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1343{
1344    int x, y;
1345    pixel *src = (pixel *)_src;
1346    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1347    pixel *dst          = (pixel *)_dst;
1348    ptrdiff_t dststride = _dststride / sizeof(pixel);
1349    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1350    int shift = 14 + 1 - BIT_DEPTH;
1351    int log2Wd = denom + shift - 1;
1352
1353    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1354    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1355    for (y = 0; y < height; y++) {
1356        for (x = 0; x < width; x++)
1357            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1358                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
1359        src  += srcstride;
1360        dst  += dststride;
1361        src2 += MAX_PB_SIZE;
1362    }
1363}
1364
1365static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1366                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1367{
1368    int x, y;
1369    pixel *src = (pixel *)_src;
1370    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1371    pixel *dst          = (pixel *)_dst;
1372    ptrdiff_t dststride = _dststride / sizeof(pixel);
1373    const int8_t *filter = ff_hevc_epel_filters[my - 1];
1374    int shift = denom + 14 - BIT_DEPTH;
1375#if BIT_DEPTH < 14
1376    int offset = 1 << (shift - 1);
1377#else
1378    int offset = 0;
1379#endif
1380
1381    ox     = ox * (1 << (BIT_DEPTH - 8));
1382    for (y = 0; y < height; y++) {
1383        for (x = 0; x < width; x++) {
1384            dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1385        }
1386        dst += dststride;
1387        src += srcstride;
1388    }
1389}
1390
1391static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1392                                       int16_t *src2,
1393                                       int height, int denom, int wx0, int wx1,
1394                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1395{
1396    int x, y;
1397    pixel *src = (pixel *)_src;
1398    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1399    const int8_t *filter = ff_hevc_epel_filters[my - 1];
1400    pixel *dst          = (pixel *)_dst;
1401    ptrdiff_t dststride = _dststride / sizeof(pixel);
1402    int shift = 14 + 1 - BIT_DEPTH;
1403    int log2Wd = denom + shift - 1;
1404
1405    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1406    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1407    for (y = 0; y < height; y++) {
1408        for (x = 0; x < width; x++)
1409            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1410                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
1411        src  += srcstride;
1412        dst  += dststride;
1413        src2 += MAX_PB_SIZE;
1414    }
1415}
1416
1417static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1418                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1419{
1420    int x, y;
1421    pixel *src = (pixel *)_src;
1422    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1423    pixel *dst          = (pixel *)_dst;
1424    ptrdiff_t dststride = _dststride / sizeof(pixel);
1425    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1426    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1427    int16_t *tmp = tmp_array;
1428    int shift = denom + 14 - BIT_DEPTH;
1429#if BIT_DEPTH < 14
1430    int offset = 1 << (shift - 1);
1431#else
1432    int offset = 0;
1433#endif
1434
1435    src -= EPEL_EXTRA_BEFORE * srcstride;
1436
1437    for (y = 0; y < height + EPEL_EXTRA; y++) {
1438        for (x = 0; x < width; x++)
1439            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1440        src += srcstride;
1441        tmp += MAX_PB_SIZE;
1442    }
1443
1444    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1445    filter = ff_hevc_epel_filters[my - 1];
1446
1447    ox     = ox * (1 << (BIT_DEPTH - 8));
1448    for (y = 0; y < height; y++) {
1449        for (x = 0; x < width; x++)
1450            dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1451        tmp += MAX_PB_SIZE;
1452        dst += dststride;
1453    }
1454}
1455
1456static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1457                                        int16_t *src2,
1458                                        int height, int denom, int wx0, int wx1,
1459                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1460{
1461    int x, y;
1462    pixel *src = (pixel *)_src;
1463    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1464    pixel *dst          = (pixel *)_dst;
1465    ptrdiff_t dststride = _dststride / sizeof(pixel);
1466    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1467    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1468    int16_t *tmp = tmp_array;
1469    int shift = 14 + 1 - BIT_DEPTH;
1470    int log2Wd = denom + shift - 1;
1471
1472    src -= EPEL_EXTRA_BEFORE * srcstride;
1473
1474    for (y = 0; y < height + EPEL_EXTRA; y++) {
1475        for (x = 0; x < width; x++)
1476            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1477        src += srcstride;
1478        tmp += MAX_PB_SIZE;
1479    }
1480
1481    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1482    filter = ff_hevc_epel_filters[my - 1];
1483
1484    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1485    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1486    for (y = 0; y < height; y++) {
1487        for (x = 0; x < width; x++)
1488            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1489                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
1490        tmp  += MAX_PB_SIZE;
1491        dst  += dststride;
1492        src2 += MAX_PB_SIZE;
1493    }
1494}
1495
1496// line zero
1497#define P3 pix[-4 * xstride]
1498#define P2 pix[-3 * xstride]
1499#define P1 pix[-2 * xstride]
1500#define P0 pix[-1 * xstride]
1501#define Q0 pix[0 * xstride]
1502#define Q1 pix[1 * xstride]
1503#define Q2 pix[2 * xstride]
1504#define Q3 pix[3 * xstride]
1505
1506// line three. used only for deblocking decision
1507#define TP3 pix[-4 * xstride + 3 * ystride]
1508#define TP2 pix[-3 * xstride + 3 * ystride]
1509#define TP1 pix[-2 * xstride + 3 * ystride]
1510#define TP0 pix[-1 * xstride + 3 * ystride]
1511#define TQ0 pix[0  * xstride + 3 * ystride]
1512#define TQ1 pix[1  * xstride + 3 * ystride]
1513#define TQ2 pix[2  * xstride + 3 * ystride]
1514#define TQ3 pix[3  * xstride + 3 * ystride]
1515
1516static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
1517                                        ptrdiff_t _xstride, ptrdiff_t _ystride,
1518                                        int beta, int *_tc,
1519                                        uint8_t *_no_p, uint8_t *_no_q)
1520{
1521    int d, j;
1522    pixel *pix        = (pixel *)_pix;
1523    ptrdiff_t xstride = _xstride / sizeof(pixel);
1524    ptrdiff_t ystride = _ystride / sizeof(pixel);
1525
1526    beta <<= BIT_DEPTH - 8;
1527
1528    for (j = 0; j < 2; j++) {
1529        const int dp0  = abs(P2  - 2 * P1  + P0);
1530        const int dq0  = abs(Q2  - 2 * Q1  + Q0);
1531        const int dp3  = abs(TP2 - 2 * TP1 + TP0);
1532        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
1533        const int d0   = dp0 + dq0;
1534        const int d3   = dp3 + dq3;
1535        const int tc   = _tc[j]   << (BIT_DEPTH - 8);
1536        const int no_p = _no_p[j];
1537        const int no_q = _no_q[j];
1538
1539        if (d0 + d3 >= beta) {
1540            pix += 4 * ystride;
1541            continue;
1542        } else {
1543            const int beta_3 = beta >> 3;
1544            const int beta_2 = beta >> 2;
1545            const int tc25   = ((tc * 5 + 1) >> 1);
1546
1547            if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
1548                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
1549                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
1550                // strong filtering
1551                const int tc2 = tc << 1;
1552                for (d = 0; d < 4; d++) {
1553                    const int p3 = P3;
1554                    const int p2 = P2;
1555                    const int p1 = P1;
1556                    const int p0 = P0;
1557                    const int q0 = Q0;
1558                    const int q1 = Q1;
1559                    const int q2 = Q2;
1560                    const int q3 = Q3;
1561                    if (!no_p) {
1562                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
1563                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
1564                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
1565                    }
1566                    if (!no_q) {
1567                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
1568                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
1569                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
1570                    }
1571                    pix += ystride;
1572                }
1573            } else { // normal filtering
1574                int nd_p = 1;
1575                int nd_q = 1;
1576                const int tc_2 = tc >> 1;
1577                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
1578                    nd_p = 2;
1579                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
1580                    nd_q = 2;
1581
1582                for (d = 0; d < 4; d++) {
1583                    const int p2 = P2;
1584                    const int p1 = P1;
1585                    const int p0 = P0;
1586                    const int q0 = Q0;
1587                    const int q1 = Q1;
1588                    const int q2 = Q2;
1589                    int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
1590                    if (abs(delta0) < 10 * tc) {
1591                        delta0 = av_clip(delta0, -tc, tc);
1592                        if (!no_p)
1593                            P0 = av_clip_pixel(p0 + delta0);
1594                        if (!no_q)
1595                            Q0 = av_clip_pixel(q0 - delta0);
1596                        if (!no_p && nd_p > 1) {
1597                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
1598                            P1 = av_clip_pixel(p1 + deltap1);
1599                        }
1600                        if (!no_q && nd_q > 1) {
1601                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
1602                            Q1 = av_clip_pixel(q1 + deltaq1);
1603                        }
1604                    }
1605                    pix += ystride;
1606                }
1607            }
1608        }
1609    }
1610}
1611
1612static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
1613                                          ptrdiff_t _ystride, int *_tc,
1614                                          uint8_t *_no_p, uint8_t *_no_q)
1615{
1616    int d, j, no_p, no_q;
1617    pixel *pix        = (pixel *)_pix;
1618    ptrdiff_t xstride = _xstride / sizeof(pixel);
1619    ptrdiff_t ystride = _ystride / sizeof(pixel);
1620
1621    for (j = 0; j < 2; j++) {
1622        const int tc = _tc[j] << (BIT_DEPTH - 8);
1623        if (tc <= 0) {
1624            pix += 4 * ystride;
1625            continue;
1626        }
1627        no_p = _no_p[j];
1628        no_q = _no_q[j];
1629
1630        for (d = 0; d < 4; d++) {
1631            int delta0;
1632            const int p1 = P1;
1633            const int p0 = P0;
1634            const int q0 = Q0;
1635            const int q1 = Q1;
1636            delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
1637            if (!no_p)
1638                P0 = av_clip_pixel(p0 + delta0);
1639            if (!no_q)
1640                Q0 = av_clip_pixel(q0 - delta0);
1641            pix += ystride;
1642        }
1643    }
1644}
1645
1646static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1647                                            int32_t *tc, uint8_t *no_p,
1648                                            uint8_t *no_q)
1649{
1650    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
1651}
1652
1653static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1654                                            int32_t *tc, uint8_t *no_p,
1655                                            uint8_t *no_q)
1656{
1657    FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
1658}
1659
1660static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1661                                          int beta, int32_t *tc, uint8_t *no_p,
1662                                          uint8_t *no_q)
1663{
1664    FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
1665                                beta, tc, no_p, no_q);
1666}
1667
1668static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1669                                          int beta, int32_t *tc, uint8_t *no_p,
1670                                          uint8_t *no_q)
1671{
1672    FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
1673                                beta, tc, no_p, no_q);
1674}
1675
1676#undef P3
1677#undef P2
1678#undef P1
1679#undef P0
1680#undef Q0
1681#undef Q1
1682#undef Q2
1683#undef Q3
1684
1685#undef TP3
1686#undef TP2
1687#undef TP1
1688#undef TP0
1689#undef TQ0
1690#undef TQ1
1691#undef TQ2
1692#undef TQ3
1693