1/*
2 * Copyright (c) 2013 Seppo Tomperi
3 * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "config.h"
23
24#include "libavutil/cpu.h"
25#include "libavutil/mem_internal.h"
26#include "libavutil/x86/asm.h"
27#include "libavutil/x86/cpu.h"
28#include "libavcodec/hevcdsp.h"
29#include "libavcodec/x86/hevcdsp.h"
30
/*
 * Prototypes of the loop-filter routines named
 * ff_hevc_<dir>_loop_filter_{chroma,luma}_<depth>_<opt>().
 *
 * LFC_FUNC / LFL_FUNC declare a single chroma / luma routine for one
 * direction; LFC_FUNCS / LFL_FUNCS declare the h and v pair.  Every
 * expansion already ends in a semicolon, so the invocations below carry
 * none.  The "type" argument of the *_FUNCS wrappers is accepted but never
 * expanded.
 */
#define LFC_FUNC(DIR, DEPTH, OPT)                                      \
void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(     \
    uint8_t *pix, ptrdiff_t stride, int *tc,                           \
    uint8_t *no_p, uint8_t *no_q);

#define LFL_FUNC(DIR, DEPTH, OPT)                                      \
void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(       \
    uint8_t *pix, ptrdiff_t stride, int beta, int *tc,                 \
    uint8_t *no_p, uint8_t *no_q);

#define LFC_FUNCS(type, depth, opt)                                    \
    LFC_FUNC(h, depth, opt)                                            \
    LFC_FUNC(v, depth, opt)

#define LFL_FUNCS(type, depth, opt)                                    \
    LFL_FUNC(h, depth, opt)                                            \
    LFL_FUNC(v, depth, opt)

/* chroma filters */
LFC_FUNCS(uint8_t,  8, sse2)
LFC_FUNCS(uint8_t, 10, sse2)
LFC_FUNCS(uint8_t, 12, sse2)
LFC_FUNCS(uint8_t,  8, avx)
LFC_FUNCS(uint8_t, 10, avx)
LFC_FUNCS(uint8_t, 12, avx)

/* luma filters */
LFL_FUNCS(uint8_t,  8, sse2)
LFL_FUNCS(uint8_t, 10, sse2)
LFL_FUNCS(uint8_t, 12, sse2)
LFL_FUNCS(uint8_t,  8, ssse3)
LFL_FUNCS(uint8_t, 10, ssse3)
LFL_FUNCS(uint8_t, 12, ssse3)
LFL_FUNCS(uint8_t,  8, avx)
LFL_FUNCS(uint8_t, 10, avx)
LFL_FUNCS(uint8_t, 12, avx)
60
/*
 * Declare ff_hevc_idct_<W>_dc_<depth>_<opt>() for depths 8, 10 and 12.
 * The expansion has no trailing semicolon; each invocation supplies it.
 */
#define IDCT_DC_FUNCS(W, opt)                                   \
    void ff_hevc_idct_ ## W ## _dc_8_  ## opt(int16_t *coeffs); \
    void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
    void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)

IDCT_DC_FUNCS(4x4,   mmxext);

IDCT_DC_FUNCS(8x8,   sse2);
IDCT_DC_FUNCS(16x16, sse2);
IDCT_DC_FUNCS(32x32, sse2);

IDCT_DC_FUNCS(16x16, avx2);
IDCT_DC_FUNCS(32x32, avx2);
72
/*
 * Declare ff_hevc_idct_<size>_<depth>_<opt>() for the sizes 4x4, 8x8, 16x16
 * and 32x32 at depths 8 and 10.  Every routine takes the coefficient buffer
 * and a col_limit value.  The expansion ends in a semicolon, so invocations
 * are written without one.
 */
#define IDCT_FUNCS(opt)                                                 \
    /* 8-bit */                                                         \
    void ff_hevc_idct_4x4_8_    ## opt(int16_t *coeffs, int col_limit); \
    void ff_hevc_idct_8x8_8_    ## opt(int16_t *coeffs, int col_limit); \
    void ff_hevc_idct_16x16_8_  ## opt(int16_t *coeffs, int col_limit); \
    void ff_hevc_idct_32x32_8_  ## opt(int16_t *coeffs, int col_limit); \
    /* 10-bit */                                                        \
    void ff_hevc_idct_4x4_10_   ## opt(int16_t *coeffs, int col_limit); \
    void ff_hevc_idct_8x8_10_   ## opt(int16_t *coeffs, int col_limit); \
    void ff_hevc_idct_16x16_10_ ## opt(int16_t *coeffs, int col_limit); \
    void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);

IDCT_FUNCS(sse2)
IDCT_FUNCS(avx)
85
/*
 * Build ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() out of the narrower
 * ff_hevc_put_hevc_<name><step>_<bitd>_<opt>() routine.
 *
 * The narrow routine is called for column offsets 0, step, 2 * step, ...
 * for as long as the offset is below W.  At offset x the int16_t
 * destination is advanced by x elements and the source by
 * x * ((bitd + 7) / 8) bytes; stride, height, mx, my and width are
 * forwarded untouched.
 */
#define mc_rep_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \
                                                uint8_t *_src, ptrdiff_t _srcstride, int height, \
                                                intptr_t mx, intptr_t my, int width) \
{ \
    const int px_bytes = (bitd + 7) / 8; \
    int x = 0; \
    while (x < W) { \
        ff_hevc_put_hevc_##name##step##_##bitd##_##opt(_dst + x, _src + x * px_bytes, \
                                                       _srcstride, height, mx, my, width); \
        x += step; \
    } \
}
/*
 * Build ff_hevc_put_hevc_uni_<name><W>_<bitd>_<opt>() by repeating the
 * <step>-wide ff_hevc_put_hevc_uni_<name><step>_<bitd>_<opt>() routine.
 *
 * Destination and source are both byte pointers, so for column offset col
 * each is advanced by col * ((bitd + 7) / 8) bytes.  Strides, height, mx,
 * my and width are passed through as received.
 */
#define mc_rep_uni_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
                                                    uint8_t *_src, ptrdiff_t _srcstride, int height, \
                                                    intptr_t mx, intptr_t my, int width) \
{ \
    int col; \
    for (col = 0; col < W; col += step) { \
        const int byte_off = col * ((bitd + 7) / 8); \
        ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(_dst + byte_off, dststride, \
                                                          _src + byte_off, _srcstride, \
                                                          height, mx, my, width); \
    } \
}
/*
 * Build ff_hevc_put_hevc_bi_<name><W>_<bitd>_<opt>() by repeating the
 * <step>-wide ff_hevc_put_hevc_bi_<name><step>_<bitd>_<opt>() routine.
 *
 * For column offset col the byte pointers _dst and _src move by
 * col * ((bitd + 7) / 8) bytes, while the int16_t pointer _src2 moves by
 * col elements.  All remaining arguments are forwarded unchanged.
 */
#define mc_rep_bi_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src, \
                                                   ptrdiff_t _srcstride, int16_t* _src2, \
                                                   int height, intptr_t mx, intptr_t my, int width) \
{ \
    const int px_bytes = (bitd + 7) / 8; \
    int col; \
    for (col = 0; col < W; col += step) { \
        uint8_t *dst_col  = _dst  + col * px_bytes; \
        uint8_t *src_col  = _src  + col * px_bytes; \
        int16_t *src2_col = _src2 + col; \
        ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst_col, dststride, src_col, _srcstride, \
                                                          src2_col, height, mx, my, width); \
    } \
}
132
/*
 * Emit the plain, uni and bi stepped wrappers for one
 * (name, bitd, step, W, opt) combination in a single invocation.
 */
#define mc_rep_funcs(name, bitd, step, W, opt) \
    mc_rep_func    (name, bitd, step, W, opt)  \
    mc_rep_uni_func(name, bitd, step, W, opt)  \
    mc_rep_bi_func (name, bitd, step, W, opt)
137
/*
 * Build ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() from two routines of
 * different widths: the <step1> routine handles the start of the block and
 * the <step2> routine continues step1 columns further on (step1 int16_t
 * elements in dst, step1 * ((bitd + 7) / 8) bytes in src).
 *
 * W only contributes to the generated name; nothing here checks that
 * step1 + step2 equals W.
 */
#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst, \
                                                 uint8_t *src, ptrdiff_t _srcstride, int height, \
                                                 intptr_t mx, intptr_t my, int width) \
{ \
    int16_t *dst_rest = dst + step1; \
    uint8_t *src_rest = src + (step1 * ((bitd + 7) / 8)); \
    ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
    ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst_rest, src_rest, \
                                                    _srcstride, height, mx, my, width); \
}
/*
 * Build ff_hevc_put_hevc_uni_<name><W>_<bitd>_<opt>() from a <step1>-wide
 * call followed by a <step2>-wide call.  The second call receives dst and
 * src both advanced by step1 * ((bitd + 7) / 8) bytes; every other argument
 * is identical for the two calls.  W is used for the generated name only.
 */
#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
                                                     uint8_t *src, ptrdiff_t _srcstride, int height, \
                                                     intptr_t mx, intptr_t my, int width) \
{ \
    const int byte_skip = step1 * ((bitd + 7) / 8); \
    ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, \
                                                        height, mx, my, width); \
    ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + byte_skip, dststride, \
                                                        src + byte_skip, _srcstride, \
                                                        height, mx, my, width); \
}
/*
 * Build ff_hevc_put_hevc_bi_<name><W>_<bitd>_<opt>() from a <step1>-wide
 * call followed by a <step2>-wide call.  For the second call dst and src
 * are advanced by step1 * ((bitd + 7) / 8) bytes and the int16_t pointer
 * src2 by step1 elements.  W is used for the generated name only.
 */
#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
                                                    ptrdiff_t _srcstride, int16_t* src2, \
                                                    int height, intptr_t mx, intptr_t my, int width) \
{ \
    const int byte_skip = step1 * ((bitd + 7) / 8); \
    ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, \
                                                       src2, height, mx, my, width); \
    ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + byte_skip, dststride, \
                                                       src + byte_skip, _srcstride, \
                                                       src2 + step1, height, mx, my, width); \
}
167
/*
 * Emit the plain, uni and bi two-step wrappers for one
 * (name, bitd, step1, step2, W, opt) combination in a single invocation.
 */
#define mc_rep_funcs2(name, bitd, step1, step2, W, opt)  \
    mc_rep_func2    (name, bitd, step1, step2, W, opt)   \
    mc_rep_uni_func2(name, bitd, step1, step2, W, opt)   \
    mc_rep_bi_func2 (name, bitd, step1, step2, W, opt)
172
173#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
174
/*
 * Build the 10-bit ff_hevc_put_hevc_<name><width1>_10_<opt1>() from two
 * routines with different suffixes: a <width2>-wide <opt1> call on the start
 * of the block, then a <width3>-wide <opt2> call whose int16_t destination
 * is advanced by width2 elements and whose source is advanced by width4
 * bytes.  width4 is supplied by the caller rather than derived here.
 */
#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
                                                 int height, intptr_t mx, intptr_t my, int width) \
{ \
    int16_t *dst_tail = dst + width2; \
    uint8_t *src_tail = src + width4; \
    ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width); \
    ff_hevc_put_hevc_##name##width3##_10_##opt2(dst_tail, src_tail, _srcstride, \
                                                height, mx, my, width); \
}
183
/*
 * Build the 10-bit ff_hevc_put_hevc_bi_<name><width1>_10_<opt1>() from a
 * <width2>-wide <opt1> call followed by a <width3>-wide <opt2> call.
 * For the second call dst and src are advanced by width4 bytes and the
 * int16_t pointer src2 by width2 elements.
 */
#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
                                                    ptrdiff_t _srcstride, int16_t *src2, \
                                                    int height, intptr_t mx, intptr_t my, int width) \
{ \
    uint8_t *dst_tail  = dst  + width4; \
    uint8_t *src_tail  = src  + width4; \
    int16_t *src2_tail = src2 + width2; \
    ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \
                                                   height, mx, my, width); \
    ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst_tail, dststride, src_tail, _srcstride, \
                                                   src2_tail, height, mx, my, width); \
}
194
/*
 * Build the 10-bit ff_hevc_put_hevc_uni_<name><width1>_10_<opt1>() from a
 * <width2>-wide <opt1> call followed by a <width3>-wide <opt2> call.
 * The second call gets dst and src both advanced by width4 bytes; width2
 * is only used to select the first routine's name.
 */
#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
                                                     uint8_t *src, ptrdiff_t _srcstride, int height, \
                                                     intptr_t mx, intptr_t my, int width) \
{ \
    uint8_t *dst_tail = dst + width4; \
    uint8_t *src_tail = src + width4; \
    ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \
                                                    height, mx, my, width); \
    ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst_tail, dststride, src_tail, _srcstride, \
                                                    height, mx, my, width); \
}
205
/*
 * Emit the plain, bi and uni 10-bit mixed-suffix wrappers for one argument
 * set in a single invocation.
 */
#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4)  \
    mc_rep_mix_10    (name, width1, width2, width3, opt1, opt2, width4)   \
    mc_bi_rep_mix_10 (name, width1, width2, width3, opt1, opt2, width4)   \
    mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
210
/*
 * Build the 8-bit ff_hevc_put_hevc_<name><width1>_8_<opt1>() from a
 * <width2>-wide <opt1> call followed by a <width3>-wide <opt2> call.
 * The second call receives dst advanced by width2 int16_t elements and src
 * advanced by width2 bytes.
 */
#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
                                                int height, intptr_t mx, intptr_t my, int width) \
{ \
    int16_t *dst_tail = dst + width2; \
    uint8_t *src_tail = src + width2; \
    ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width); \
    ff_hevc_put_hevc_##name##width3##_8_##opt2(dst_tail, src_tail, _srcstride, \
                                               height, mx, my, width); \
}
219
/*
 * Build the 8-bit ff_hevc_put_hevc_bi_<name><width1>_8_<opt1>() from a
 * <width2>-wide <opt1> call followed by a <width3>-wide <opt2> call.
 * For the second call dst and src move by width2 bytes and the int16_t
 * pointer src2 by width2 elements.
 */
#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
                                                   ptrdiff_t _srcstride, int16_t* src2, \
                                                   int height, intptr_t mx, intptr_t my, int width) \
{ \
    uint8_t *dst_tail  = dst  + width2; \
    uint8_t *src_tail  = src  + width2; \
    int16_t *src2_tail = src2 + width2; \
    ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
                                                  src2, height, mx, my, width); \
    ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst_tail, dststride, src_tail, _srcstride, \
                                                  src2_tail, height, mx, my, width); \
}
230
/*
 * Build the 8-bit ff_hevc_put_hevc_uni_<name><width1>_8_<opt1>() from a
 * <width2>-wide <opt1> call followed by a <width3>-wide <opt2> call.
 * The second call gets dst and src both advanced by width2 bytes.
 */
#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
                                                    uint8_t *src, ptrdiff_t _srcstride, int height, \
                                                    intptr_t mx, intptr_t my, int width) \
{ \
    uint8_t *dst_tail = dst + width2; \
    uint8_t *src_tail = src + width2; \
    ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
                                                   height, mx, my, width); \
    ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst_tail, dststride, src_tail, _srcstride, \
                                                   height, mx, my, width); \
}
241
/*
 * Emit the plain, bi and uni 8-bit mixed-suffix wrappers for one argument
 * set in a single invocation.
 */
#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2)  \
    mc_rep_mix_8    (name, width1, width2, width3, opt1, opt2)   \
    mc_bi_rep_mix_8 (name, width1, width2, width3, opt1, opt2)   \
    mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
246
#if HAVE_AVX2_EXTERNAL

/*
 * AVX2 wrapper instantiations, compiled only when HAVE_AVX2_EXTERNAL is set.
 * The "mix" lines pair an avx2 call with an sse4 call; the "rep" lines
 * repeat a single avx2 routine across the block width.
 */

/* 8-bit, width 48 = 32 (avx2) + 16 (sse4) */
mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4)
mc_rep_mixs_8(epel_hv,    48, 32, 16, avx2, sse4)
mc_rep_mixs_8(epel_h,     48, 32, 16, avx2, sse4)
mc_rep_mixs_8(epel_v,     48, 32, 16, avx2, sse4)

/* 10-bit, width 24 = 16 (avx2) + 8 (sse4); last argument is a byte offset */
mc_rep_mix_10(pel_pixels,    24, 16, 8, avx2, sse4, 32)
mc_bi_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(epel_hv,      24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(epel_h,       24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(epel_v,       24, 16, 8, avx2, sse4, 32)

mc_rep_mixs_10(qpel_h,       24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(qpel_v,       24, 16, 8, avx2, sse4, 32)
mc_rep_mixs_10(qpel_hv,      24, 16, 8, avx2, sse4, 32)

/* pel_pixels */
mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2) /* used for 10bit */
mc_rep_uni_func(pel_pixels, 8, 32,  96, avx2) /* used for 10bit */

mc_rep_funcs(pel_pixels, 8, 32, 64, avx2)

mc_rep_func(pel_pixels, 10, 16, 32, avx2)
mc_rep_func(pel_pixels, 10, 16, 48, avx2)
mc_rep_func(pel_pixels, 10, 32, 64, avx2)

mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)

/* epel_h / epel_v */
mc_rep_funcs(epel_h, 8, 32, 64, avx2)

mc_rep_funcs(epel_v, 8, 32, 64, avx2)

mc_rep_funcs(epel_h, 10, 16, 32, avx2)
mc_rep_funcs(epel_h, 10, 16, 48, avx2)
mc_rep_funcs(epel_h, 10, 32, 64, avx2)

mc_rep_funcs(epel_v, 10, 16, 32, avx2)
mc_rep_funcs(epel_v, 10, 16, 48, avx2)
mc_rep_funcs(epel_v, 10, 32, 64, avx2)

/* epel_hv */
mc_rep_funcs(epel_hv,  8, 32, 64, avx2)

mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
mc_rep_funcs(epel_hv, 10, 32, 64, avx2)

/* qpel_h / qpel_v, 8-bit */
mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
mc_rep_mixs_8(qpel_h, 48, 32, 16, avx2, sse4)

mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4)

/* qpel_h / qpel_v / qpel_hv, 10-bit */
mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
mc_rep_funcs(qpel_h, 10, 32, 64, avx2)

mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
mc_rep_funcs(qpel_v, 10, 32, 64, avx2)

mc_rep_funcs(qpel_hv, 10, 16, 32, avx2)
mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)

#endif /* HAVE_AVX2_EXTERNAL */
317
318mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
319mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
320mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
321mc_rep_funcs(pel_pixels, 8,  8, 24, sse4)
322mc_rep_funcs(pel_pixels,10,  8, 64, sse4)
323mc_rep_funcs(pel_pixels,10,  8, 48, sse4)
324mc_rep_funcs(pel_pixels,10,  8, 32, sse4)
325mc_rep_funcs(pel_pixels,10,  8, 24, sse4)
326mc_rep_funcs(pel_pixels,10,  8, 16, sse4)
327mc_rep_funcs(pel_pixels,10,  4, 12, sse4)
328mc_rep_funcs(pel_pixels,12,  8, 64, sse4)
329mc_rep_funcs(pel_pixels,12,  8, 48, sse4)
330mc_rep_funcs(pel_pixels,12,  8, 32, sse4)
331mc_rep_funcs(pel_pixels,12,  8, 24, sse4)
332mc_rep_funcs(pel_pixels,12,  8, 16, sse4)
333mc_rep_funcs(pel_pixels,12,  4, 12, sse4)
334
335mc_rep_funcs(epel_h, 8, 16, 64, sse4)
336mc_rep_funcs(epel_h, 8, 16, 48, sse4)
337mc_rep_funcs(epel_h, 8, 16, 32, sse4)
338mc_rep_funcs(epel_h, 8,  8, 24, sse4)
339mc_rep_funcs(epel_h,10,  8, 64, sse4)
340mc_rep_funcs(epel_h,10,  8, 48, sse4)
341mc_rep_funcs(epel_h,10,  8, 32, sse4)
342mc_rep_funcs(epel_h,10,  8, 24, sse4)
343mc_rep_funcs(epel_h,10,  8, 16, sse4)
344mc_rep_funcs(epel_h,10,  4, 12, sse4)
345mc_rep_funcs(epel_h,12,  8, 64, sse4)
346mc_rep_funcs(epel_h,12,  8, 48, sse4)
347mc_rep_funcs(epel_h,12,  8, 32, sse4)
348mc_rep_funcs(epel_h,12,  8, 24, sse4)
349mc_rep_funcs(epel_h,12,  8, 16, sse4)
350mc_rep_funcs(epel_h,12,  4, 12, sse4)
351mc_rep_funcs(epel_v, 8, 16, 64, sse4)
352mc_rep_funcs(epel_v, 8, 16, 48, sse4)
353mc_rep_funcs(epel_v, 8, 16, 32, sse4)
354mc_rep_funcs(epel_v, 8,  8, 24, sse4)
355mc_rep_funcs(epel_v,10,  8, 64, sse4)
356mc_rep_funcs(epel_v,10,  8, 48, sse4)
357mc_rep_funcs(epel_v,10,  8, 32, sse4)
358mc_rep_funcs(epel_v,10,  8, 24, sse4)
359mc_rep_funcs(epel_v,10,  8, 16, sse4)
360mc_rep_funcs(epel_v,10,  4, 12, sse4)
361mc_rep_funcs(epel_v,12,  8, 64, sse4)
362mc_rep_funcs(epel_v,12,  8, 48, sse4)
363mc_rep_funcs(epel_v,12,  8, 32, sse4)
364mc_rep_funcs(epel_v,12,  8, 24, sse4)
365mc_rep_funcs(epel_v,12,  8, 16, sse4)
366mc_rep_funcs(epel_v,12,  4, 12, sse4)
367mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
368mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
369mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
370mc_rep_funcs(epel_hv, 8,  8, 24, sse4)
371mc_rep_funcs2(epel_hv,8,  8,  4, 12, sse4)
372mc_rep_funcs(epel_hv,10,  8, 64, sse4)
373mc_rep_funcs(epel_hv,10,  8, 48, sse4)
374mc_rep_funcs(epel_hv,10,  8, 32, sse4)
375mc_rep_funcs(epel_hv,10,  8, 24, sse4)
376mc_rep_funcs(epel_hv,10,  8, 16, sse4)
377mc_rep_funcs(epel_hv,10,  4, 12, sse4)
378mc_rep_funcs(epel_hv,12,  8, 64, sse4)
379mc_rep_funcs(epel_hv,12,  8, 48, sse4)
380mc_rep_funcs(epel_hv,12,  8, 32, sse4)
381mc_rep_funcs(epel_hv,12,  8, 24, sse4)
382mc_rep_funcs(epel_hv,12,  8, 16, sse4)
383mc_rep_funcs(epel_hv,12,  4, 12, sse4)
384
385mc_rep_funcs(qpel_h, 8, 16, 64, sse4)
386mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
387mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
388mc_rep_funcs(qpel_h, 8,  8, 24, sse4)
389mc_rep_funcs(qpel_h,10,  8, 64, sse4)
390mc_rep_funcs(qpel_h,10,  8, 48, sse4)
391mc_rep_funcs(qpel_h,10,  8, 32, sse4)
392mc_rep_funcs(qpel_h,10,  8, 24, sse4)
393mc_rep_funcs(qpel_h,10,  8, 16, sse4)
394mc_rep_funcs(qpel_h,10,  4, 12, sse4)
395mc_rep_funcs(qpel_h,12,  8, 64, sse4)
396mc_rep_funcs(qpel_h,12,  8, 48, sse4)
397mc_rep_funcs(qpel_h,12,  8, 32, sse4)
398mc_rep_funcs(qpel_h,12,  8, 24, sse4)
399mc_rep_funcs(qpel_h,12,  8, 16, sse4)
400mc_rep_funcs(qpel_h,12,  4, 12, sse4)
401mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
402mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
403mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
404mc_rep_funcs(qpel_v, 8,  8, 24, sse4)
405mc_rep_funcs(qpel_v,10,  8, 64, sse4)
406mc_rep_funcs(qpel_v,10,  8, 48, sse4)
407mc_rep_funcs(qpel_v,10,  8, 32, sse4)
408mc_rep_funcs(qpel_v,10,  8, 24, sse4)
409mc_rep_funcs(qpel_v,10,  8, 16, sse4)
410mc_rep_funcs(qpel_v,10,  4, 12, sse4)
411mc_rep_funcs(qpel_v,12,  8, 64, sse4)
412mc_rep_funcs(qpel_v,12,  8, 48, sse4)
413mc_rep_funcs(qpel_v,12,  8, 32, sse4)
414mc_rep_funcs(qpel_v,12,  8, 24, sse4)
415mc_rep_funcs(qpel_v,12,  8, 16, sse4)
416mc_rep_funcs(qpel_v,12,  4, 12, sse4)
417mc_rep_funcs(qpel_hv, 8,  8, 64, sse4)
418mc_rep_funcs(qpel_hv, 8,  8, 48, sse4)
419mc_rep_funcs(qpel_hv, 8,  8, 32, sse4)
420mc_rep_funcs(qpel_hv, 8,  8, 24, sse4)
421mc_rep_funcs(qpel_hv, 8,  8, 16, sse4)
422mc_rep_funcs2(qpel_hv,8,  8,  4, 12, sse4)
423mc_rep_funcs(qpel_hv,10,  8, 64, sse4)
424mc_rep_funcs(qpel_hv,10,  8, 48, sse4)
425mc_rep_funcs(qpel_hv,10,  8, 32, sse4)
426mc_rep_funcs(qpel_hv,10,  8, 24, sse4)
427mc_rep_funcs(qpel_hv,10,  8, 16, sse4)
428mc_rep_funcs(qpel_hv,10,  4, 12, sse4)
429mc_rep_funcs(qpel_hv,12,  8, 64, sse4)
430mc_rep_funcs(qpel_hv,12,  8, 48, sse4)
431mc_rep_funcs(qpel_hv,12,  8, 32, sse4)
432mc_rep_funcs(qpel_hv,12,  8, 24, sse4)
433mc_rep_funcs(qpel_hv,12,  8, 16, sse4)
434mc_rep_funcs(qpel_hv,12,  4, 12, sse4)
435
/*
 * Build ff_hevc_put_hevc_uni_w<W>_<bitd>_<opt>() by repeating the
 * <step>-wide ff_hevc_put_hevc_uni_w<step>_<bitd>_<opt>() routine.
 *
 * For column offset col the int16_t source moves by col elements and the
 * byte destination by col * ((bitd + 7) / 8) bytes.  dststride, height,
 * denom, _wx and _ox are forwarded unchanged.
 */
#define mc_rep_uni_w(bitd, step, W, opt) \
void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
                                               int height, int denom,  int _wx, int _ox) \
{ \
    const int px_bytes = (bitd + 7) / 8; \
    int col = 0; \
    while (col < W) { \
        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(_dst + col * px_bytes, dststride, \
                                                     _src + col, height, denom, _wx, _ox); \
        col += step; \
    } \
}
450
451mc_rep_uni_w(8, 6, 12, sse4)
452mc_rep_uni_w(8, 8, 16, sse4)
453mc_rep_uni_w(8, 8, 24, sse4)
454mc_rep_uni_w(8, 8, 32, sse4)
455mc_rep_uni_w(8, 8, 48, sse4)
456mc_rep_uni_w(8, 8, 64, sse4)
457
458mc_rep_uni_w(10, 6, 12, sse4)
459mc_rep_uni_w(10, 8, 16, sse4)
460mc_rep_uni_w(10, 8, 24, sse4)
461mc_rep_uni_w(10, 8, 32, sse4)
462mc_rep_uni_w(10, 8, 48, sse4)
463mc_rep_uni_w(10, 8, 64, sse4)
464
465mc_rep_uni_w(12, 6, 12, sse4)
466mc_rep_uni_w(12, 8, 16, sse4)
467mc_rep_uni_w(12, 8, 24, sse4)
468mc_rep_uni_w(12, 8, 32, sse4)
469mc_rep_uni_w(12, 8, 48, sse4)
470mc_rep_uni_w(12, 8, 64, sse4)
471
/*
 * Build ff_hevc_put_hevc_bi_w<W>_<bitd>_<opt>() by repeating the
 * <step>-wide ff_hevc_put_hevc_bi_w<step>_<bitd>_<opt>() routine.
 *
 * For column offset col both int16_t sources move by col elements and the
 * byte destination by col * ((bitd + 7) / 8) bytes.  dststride, height,
 * denom and the two weight/offset pairs are forwarded unchanged.
 */
#define mc_rep_bi_w(bitd, step, W, opt) \
void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
                                              int16_t *_src2, int height, \
                                              int denom,  int _wx0,  int _wx1, int _ox0, int _ox1) \
{ \
    const int px_bytes = (bitd + 7) / 8; \
    int col; \
    for (col = 0; col < W; col += step) { \
        uint8_t *dst_col  = _dst  + col * px_bytes; \
        int16_t *src_col  = _src  + col; \
        int16_t *src2_col = _src2 + col; \
        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst_col, dststride, src_col, src2_col, \
                                                     height, denom, _wx0, _wx1, _ox0, _ox1); \
    } \
}
489
490mc_rep_bi_w(8, 6, 12, sse4)
491mc_rep_bi_w(8, 8, 16, sse4)
492mc_rep_bi_w(8, 8, 24, sse4)
493mc_rep_bi_w(8, 8, 32, sse4)
494mc_rep_bi_w(8, 8, 48, sse4)
495mc_rep_bi_w(8, 8, 64, sse4)
496
497mc_rep_bi_w(10, 6, 12, sse4)
498mc_rep_bi_w(10, 8, 16, sse4)
499mc_rep_bi_w(10, 8, 24, sse4)
500mc_rep_bi_w(10, 8, 32, sse4)
501mc_rep_bi_w(10, 8, 48, sse4)
502mc_rep_bi_w(10, 8, 64, sse4)
503
504mc_rep_bi_w(12, 6, 12, sse4)
505mc_rep_bi_w(12, 8, 16, sse4)
506mc_rep_bi_w(12, 8, 24, sse4)
507mc_rep_bi_w(12, 8, 32, sse4)
508mc_rep_bi_w(12, 8, 48, sse4)
509mc_rep_bi_w(12, 8, 64, sse4)
510
/*
 * Build ff_hevc_put_hevc_uni_w_<name><W>_<bitd>_<opt>() as a two-stage
 * pipeline:
 *   1. ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() writes its int16_t output
 *      into a local scratch buffer of 71 * MAX_PB_SIZE elements declared
 *      through LOCAL_ALIGNED_16;
 *   2. ff_hevc_put_hevc_uni_w<W>_<bitd>_<opt>() reads that buffer and writes
 *      to _dst using denom, _wx and _ox.
 */
#define mc_uni_w_func(name, bitd, W, opt) \
void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
                                                      uint8_t *_src, ptrdiff_t _srcstride, \
                                                      int height, int denom, \
                                                      int _wx, int _ox, \
                                                      intptr_t mx, intptr_t my, int width) \
{ \
    LOCAL_ALIGNED_16(int16_t, interm, [71 * MAX_PB_SIZE]); \
    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(interm, _src, _srcstride, \
                                                height, mx, my, width); \
    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, interm, \
                                               height, denom, _wx, _ox); \
}
522
/*
 * Instantiate mc_uni_w_func for the block widths 4, 8, 12, 16, 24, 32, 48
 * and 64 of one (name, bitd, opt) combination.
 */
#define mc_uni_w_funcs(name, bitd, opt)     \
    mc_uni_w_func(name, bitd,  4, opt)      \
    mc_uni_w_func(name, bitd,  8, opt)      \
    mc_uni_w_func(name, bitd, 12, opt)      \
    mc_uni_w_func(name, bitd, 16, opt)      \
    mc_uni_w_func(name, bitd, 24, opt)      \
    mc_uni_w_func(name, bitd, 32, opt)      \
    mc_uni_w_func(name, bitd, 48, opt)      \
    mc_uni_w_func(name, bitd, 64, opt)
532
/*
 * 8-bit sse4 instantiations of mc_uni_w_func (defined just above).
 * pel_pixels and the three epel_* variants get the common width list plus an
 * extra width-6 instance; the qpel_* variants get the common width list only.
 */
mc_uni_w_funcs(pel_pixels, 8, sse4)
mc_uni_w_func(pel_pixels, 8, 6, sse4)
mc_uni_w_funcs(epel_h, 8, sse4)
mc_uni_w_func(epel_h, 8, 6, sse4)
mc_uni_w_funcs(epel_v, 8, sse4)
mc_uni_w_func(epel_v, 8, 6, sse4)
mc_uni_w_funcs(epel_hv, 8, sse4)
mc_uni_w_func(epel_hv, 8, 6, sse4)
mc_uni_w_funcs(qpel_h, 8, sse4)
mc_uni_w_funcs(qpel_v, 8, sse4)
mc_uni_w_funcs(qpel_hv, 8, sse4)

/* Same set of instantiations for bit depth 10. */
mc_uni_w_funcs(pel_pixels, 10, sse4)
mc_uni_w_func(pel_pixels, 10, 6, sse4)
mc_uni_w_funcs(epel_h, 10, sse4)
mc_uni_w_func(epel_h, 10, 6, sse4)
mc_uni_w_funcs(epel_v, 10, sse4)
mc_uni_w_func(epel_v, 10, 6, sse4)
mc_uni_w_funcs(epel_hv, 10, sse4)
mc_uni_w_func(epel_hv, 10, 6, sse4)
mc_uni_w_funcs(qpel_h, 10, sse4)
mc_uni_w_funcs(qpel_v, 10, sse4)
mc_uni_w_funcs(qpel_hv, 10, sse4)

/* Same set of instantiations for bit depth 12. */
mc_uni_w_funcs(pel_pixels, 12, sse4)
mc_uni_w_func(pel_pixels, 12, 6, sse4)
mc_uni_w_funcs(epel_h, 12, sse4)
mc_uni_w_func(epel_h, 12, 6, sse4)
mc_uni_w_funcs(epel_v, 12, sse4)
mc_uni_w_func(epel_v, 12, 6, sse4)
mc_uni_w_funcs(epel_hv, 12, sse4)
mc_uni_w_func(epel_hv, 12, 6, sse4)
mc_uni_w_funcs(qpel_h, 12, sse4)
mc_uni_w_funcs(qpel_v, 12, sse4)
mc_uni_w_funcs(qpel_hv, 12, sse4)
568
/*
 * Generate ff_hevc_put_hevc_bi_w_<name><W>_<bitd>_<opt>() as a two-step
 * wrapper around routines that already exist:
 *   1. ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() is run on _src and writes
 *      into an int16_t scratch array declared with LOCAL_ALIGNED_16;
 *      mx, my and width are consumed by this step only.
 *   2. ff_hevc_put_hevc_bi_w<W>_<bitd>_<opt>() receives that scratch array
 *      together with _src2, denom, both weights and both offsets, and writes
 *      to _dst.
 */
#define mc_bi_w_func(name, bitd, W, opt)                                          \
void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst,              \
                                                     ptrdiff_t _dststride,        \
                                                     uint8_t *_src,               \
                                                     ptrdiff_t _srcstride,        \
                                                     int16_t *_src2,              \
                                                     int height, int denom,       \
                                                     int _wx0, int _wx1,          \
                                                     int _ox0, int _ox1,          \
                                                     intptr_t mx, intptr_t my,    \
                                                     int width)                   \
{                                                                                 \
    /* output of step 1, input of step 2 */                                       \
    LOCAL_ALIGNED_16(int16_t, scratch, [71 * MAX_PB_SIZE]);                       \
                                                                                  \
    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(scratch, _src, _srcstride,        \
                                                height, mx, my, width);           \
    ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, scratch, _src2,   \
                                              height, denom,                      \
                                              _wx0, _wx1, _ox0, _ox1);            \
}
582
/*
 * Expand mc_bi_w_func once for each block width in the common set
 * 4, 8, 12, 16, 24, 32, 48 and 64, forwarding name, bitd and opt unchanged.
 * Width 6 is not part of this list; where it is wanted, mc_bi_w_func is
 * invoked directly with W = 6.
 */
#define mc_bi_w_funcs(name, bitd, opt)      \
        mc_bi_w_func(name, bitd, 4, opt)    \
        mc_bi_w_func(name, bitd, 8, opt)    \
        mc_bi_w_func(name, bitd, 12, opt)   \
        mc_bi_w_func(name, bitd, 16, opt)   \
        mc_bi_w_func(name, bitd, 24, opt)   \
        mc_bi_w_func(name, bitd, 32, opt)   \
        mc_bi_w_func(name, bitd, 48, opt)   \
        mc_bi_w_func(name, bitd, 64, opt)
592
/*
 * 8-bit sse4 instantiations of the bi_w wrappers generated by mc_bi_w_func.
 * pel_pixels and the three epel_* variants get the common width list plus an
 * extra width-6 instance; the qpel_* variants get the common width list only.
 */
mc_bi_w_funcs(pel_pixels, 8, sse4)
mc_bi_w_func(pel_pixels, 8, 6, sse4)
mc_bi_w_funcs(epel_h, 8, sse4)
mc_bi_w_func(epel_h, 8, 6, sse4)
mc_bi_w_funcs(epel_v, 8, sse4)
mc_bi_w_func(epel_v, 8, 6, sse4)
mc_bi_w_funcs(epel_hv, 8, sse4)
mc_bi_w_func(epel_hv, 8, 6, sse4)
mc_bi_w_funcs(qpel_h, 8, sse4)
mc_bi_w_funcs(qpel_v, 8, sse4)
mc_bi_w_funcs(qpel_hv, 8, sse4)

/* Same set of instantiations for bit depth 10. */
mc_bi_w_funcs(pel_pixels, 10, sse4)
mc_bi_w_func(pel_pixels, 10, 6, sse4)
mc_bi_w_funcs(epel_h, 10, sse4)
mc_bi_w_func(epel_h, 10, 6, sse4)
mc_bi_w_funcs(epel_v, 10, sse4)
mc_bi_w_func(epel_v, 10, 6, sse4)
mc_bi_w_funcs(epel_hv, 10, sse4)
mc_bi_w_func(epel_hv, 10, 6, sse4)
mc_bi_w_funcs(qpel_h, 10, sse4)
mc_bi_w_funcs(qpel_v, 10, sse4)
mc_bi_w_funcs(qpel_hv, 10, sse4)

/* Same set of instantiations for bit depth 12. */
mc_bi_w_funcs(pel_pixels, 12, sse4)
mc_bi_w_func(pel_pixels, 12, 6, sse4)
mc_bi_w_funcs(epel_h, 12, sse4)
mc_bi_w_func(epel_h, 12, 6, sse4)
mc_bi_w_funcs(epel_v, 12, sse4)
mc_bi_w_func(epel_v, 12, 6, sse4)
mc_bi_w_funcs(epel_hv, 12, sse4)
mc_bi_w_func(epel_hv, 12, 6, sse4)
mc_bi_w_funcs(qpel_h, 12, sse4)
mc_bi_w_funcs(qpel_v, 12, sse4)
mc_bi_w_funcs(qpel_hv, 12, sse4)
628#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
629
/*
 * Prototype of one SAO band-filter routine,
 * ff_hevc_sao_band_filter_<w>_<bitd>_<opt>(). The routine itself is defined
 * outside this file section; only the declaration is emitted here.
 */
#define SAO_BAND_FILTER_FUNC(w, bitd, opt)                                        \
void ff_hevc_sao_band_filter_##w##_##bitd##_##opt(uint8_t *_dst, uint8_t *_src,   \
                                                  ptrdiff_t _stride_dst,          \
                                                  ptrdiff_t _stride_src,          \
                                                  int16_t *sao_offset_val,        \
                                                  int sao_left_class,             \
                                                  int width, int height);

/*
 * Declare the five band-filter prototypes (name suffixes 8, 16, 32, 48, 64)
 * for one bit depth / instruction-set combination.
 */
#define SAO_BAND_FILTER_FUNCS(bitd, opt)  \
    SAO_BAND_FILTER_FUNC(8,  bitd, opt)   \
    SAO_BAND_FILTER_FUNC(16, bitd, opt)   \
    SAO_BAND_FILTER_FUNC(32, bitd, opt)   \
    SAO_BAND_FILTER_FUNC(48, bitd, opt)   \
    SAO_BAND_FILTER_FUNC(64, bitd, opt)
641
/*
 * Band-filter prototypes for bit depths 8, 10 and 12, each in an sse2, avx
 * and avx2 flavour (five routines per combination).
 */
SAO_BAND_FILTER_FUNCS(8,  sse2)
SAO_BAND_FILTER_FUNCS(10, sse2)
SAO_BAND_FILTER_FUNCS(12, sse2)
SAO_BAND_FILTER_FUNCS(8,   avx)
SAO_BAND_FILTER_FUNCS(10,  avx)
SAO_BAND_FILTER_FUNCS(12,  avx)
SAO_BAND_FILTER_FUNCS(8,  avx2)
SAO_BAND_FILTER_FUNCS(10, avx2)
SAO_BAND_FILTER_FUNCS(12, avx2)
651
/* Store ff_hevc_sao_band_filter_<w>_<bitd>_<opt> in c->sao_band_filter[slot]. */
#define SAO_BAND_INIT_SLOT(slot, w, bitd, opt) \
    c->sao_band_filter[slot] = ff_hevc_sao_band_filter_##w##_##bitd##_##opt

/*
 * Fill all five entries of c->sao_band_filter for one bit depth and
 * instruction set: slots 0..4 receive the routines whose names carry the
 * suffixes 8, 16, 32, 48 and 64. A variable named `c` must be in scope at
 * the point of use.
 */
#define SAO_BAND_INIT(bitd, opt)               \
    do {                                       \
        SAO_BAND_INIT_SLOT(0,  8, bitd, opt);  \
        SAO_BAND_INIT_SLOT(1, 16, bitd, opt);  \
        SAO_BAND_INIT_SLOT(2, 32, bitd, opt);  \
        SAO_BAND_INIT_SLOT(3, 48, bitd, opt);  \
        SAO_BAND_INIT_SLOT(4, 64, bitd, opt);  \
    } while (0)
659
/*
 * Declare the five SAO edge-filter prototypes
 * ff_hevc_sao_edge_filter_{8,16,32,48,64}_<bitd>_<opt>() for one bit depth /
 * instruction-set combination. The routines are defined outside this file
 * section; only the declarations are emitted here.
 *
 * The final line deliberately carries no line-continuation backslash, so the
 * macro definition ends with the last prototype and cannot absorb whatever
 * line happens to follow it.
 */
#define SAO_EDGE_FILTER_FUNCS(bitd, opt)                                                                                    \
void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,  \
                                              int eo, int width, int height);                                               \
void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
                                               int eo, int width, int height);                                              \
void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
                                               int eo, int width, int height);                                              \
void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
                                               int eo, int width, int height);                                              \
void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
                                               int eo, int width, int height);
671
/*
 * Edge-filter prototypes: bit depth 8 in ssse3 and avx2 flavours, bit depths
 * 10 and 12 in sse2 and avx2 flavours (five routines per combination).
 */
SAO_EDGE_FILTER_FUNCS(8, ssse3)
SAO_EDGE_FILTER_FUNCS(8, avx2)
SAO_EDGE_FILTER_FUNCS(10, sse2)
SAO_EDGE_FILTER_FUNCS(10, avx2)
SAO_EDGE_FILTER_FUNCS(12, sse2)
SAO_EDGE_FILTER_FUNCS(12, avx2)
678
/* Store ff_hevc_sao_edge_filter_<w>_<bitd>_<opt> in c->sao_edge_filter[slot]. */
#define SAO_EDGE_INIT_SLOT(slot, w, bitd, opt) \
    c->sao_edge_filter[slot] = ff_hevc_sao_edge_filter_##w##_##bitd##_##opt

/*
 * Fill all five entries of c->sao_edge_filter for one bit depth and
 * instruction set: slots 0..4 receive the routines whose names carry the
 * suffixes 8, 16, 32, 48 and 64. A variable named `c` must be in scope at
 * the point of use.
 */
#define SAO_EDGE_INIT(bitd, opt)               \
    do {                                       \
        SAO_EDGE_INIT_SLOT(0,  8, bitd, opt);  \
        SAO_EDGE_INIT_SLOT(1, 16, bitd, opt);  \
        SAO_EDGE_INIT_SLOT(2, 32, bitd, opt);  \
        SAO_EDGE_INIT_SLOT(3, 48, bitd, opt);  \
        SAO_EDGE_INIT_SLOT(4, 64, bitd, opt);  \
    } while (0)
686
/*
 * Invoke PEL_LINK (defined elsewhere in this file) once per EPEL block width.
 * The second PEL_LINK argument runs 1..9 and is paired with the widths
 * 4, 6, 8, 12, 16, 24, 32, 48 and 64, which are pasted onto fname; pointer,
 * my, mx, bitd and opt are forwarded unchanged.
 *
 * The expansion is wrapped in do { } while (0) so that the macro behaves as a
 * single statement: under an unbraced if/else all nine PEL_LINK invocations
 * are guarded, not just the first. Use it as a statement followed by ';'.
 */
#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt) do {      \
        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
        PEL_LINK(pointer, 2, my , mx , fname##6 ,  bitd, opt ); \
        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt ); \
    } while (0)
/*
 * Invoke PEL_LINK (defined elsewhere in this file) once per QPEL block width.
 * The second PEL_LINK argument takes the values 1, 3, 4, 5, 6, 7, 8 and 9,
 * paired with the widths 4, 8, 12, 16, 24, 32, 48 and 64, which are pasted
 * onto fname; value 2 (width 6 in EPEL_LINKS) is not linked here. pointer,
 * my, mx, bitd and opt are forwarded unchanged.
 *
 * The expansion is wrapped in do { } while (0) so that the macro behaves as a
 * single statement: under an unbraced if/else all eight PEL_LINK invocations
 * are guarded, not just the first. Use it as a statement followed by ';'.
 */
#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) do {      \
        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt ); \
    } while (0)
706
707void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
708{
709    int cpu_flags = av_get_cpu_flags();
710
711    if (bit_depth == 8) {
712        if (EXTERNAL_MMXEXT(cpu_flags)) {
713            c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
714
715            c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
716        }
717        if (EXTERNAL_SSE2(cpu_flags)) {
718            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
719            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
720            if (ARCH_X86_64) {
721                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
722                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
723
724                c->idct[2] = ff_hevc_idct_16x16_8_sse2;
725                c->idct[3] = ff_hevc_idct_32x32_8_sse2;
726            }
727            SAO_BAND_INIT(8, sse2);
728
729            c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
730            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
731            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
732
733            c->idct[0]    = ff_hevc_idct_4x4_8_sse2;
734            c->idct[1]    = ff_hevc_idct_8x8_8_sse2;
735
736            c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
737            c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
738            c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
739        }
740        if (EXTERNAL_SSSE3(cpu_flags)) {
741            if(ARCH_X86_64) {
742                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
743                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
744            }
745            SAO_EDGE_INIT(8, ssse3);
746        }
747        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
748
749            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
750            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
751            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
752            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
753
754            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
755            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
756            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
757            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
758        }
759        if (EXTERNAL_AVX(cpu_flags)) {
760            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
761            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
762            if (ARCH_X86_64) {
763                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
764                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
765
766                c->idct[2] = ff_hevc_idct_16x16_8_avx;
767                c->idct[3] = ff_hevc_idct_32x32_8_avx;
768            }
769            SAO_BAND_INIT(8, avx);
770
771            c->idct[0] = ff_hevc_idct_4x4_8_avx;
772            c->idct[1] = ff_hevc_idct_8x8_8_avx;
773
774            c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
775            c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
776            c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
777        }
778        if (EXTERNAL_AVX2(cpu_flags)) {
779            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
780            c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
781        }
782        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
783            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
784            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
785            if (ARCH_X86_64) {
786                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
787                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
788                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
789
790                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
791                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
792                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
793
794                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
795                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
796                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
797
798                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
799                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
800                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
801
802                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
803                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
804                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
805
806                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
807                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
808                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
809
810                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
811                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
812                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
813
814                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
815                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
816                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
817
818                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
819                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
820                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
821
822                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
823                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
824                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
825
826                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
827                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
828                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
829
830                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
831                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
832                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
833
834                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
835                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
836                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
837
838                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
839                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
840                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
841
842                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
843                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
844                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
845
846                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
847                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
848                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
849
850                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
851                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
852                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
853
854                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
855                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
856                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
857
858                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
859                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
860                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
861
862                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
863                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
864                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
865
866                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
867                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
868                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
869            }
870            SAO_BAND_INIT(8, avx2);
871
872            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
873            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
874            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
875
876            c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
877        }
878        if (EXTERNAL_AVX512ICL(cpu_flags) && ARCH_X86_64) {
879            c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_avx512icl;
880            c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
881            c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_avx512icl;
882            c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx512icl;
883            c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx512icl;
884            c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
885        }
886    } else if (bit_depth == 10) {
887        if (EXTERNAL_MMXEXT(cpu_flags)) {
888            c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
889            c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
890        }
891        if (EXTERNAL_SSE2(cpu_flags)) {
892            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
893            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
894            if (ARCH_X86_64) {
895                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
896                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
897
898                c->idct[2] = ff_hevc_idct_16x16_10_sse2;
899                c->idct[3] = ff_hevc_idct_32x32_10_sse2;
900            }
901            SAO_BAND_INIT(10, sse2);
902            SAO_EDGE_INIT(10, sse2);
903
904            c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
905            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
906            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
907
908            c->idct[0]    = ff_hevc_idct_4x4_10_sse2;
909            c->idct[1]    = ff_hevc_idct_8x8_10_sse2;
910
911            c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
912            c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
913            c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
914        }
915        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
916            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
917            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
918        }
919        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
920            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
921            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
922            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
923            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
924
925            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
926            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
927            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
928            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
929        }
930        if (EXTERNAL_AVX(cpu_flags)) {
931            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
932            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
933            if (ARCH_X86_64) {
934                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
935                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
936
937                c->idct[2] = ff_hevc_idct_16x16_10_avx;
938                c->idct[3] = ff_hevc_idct_32x32_10_avx;
939            }
940
941            c->idct[0] = ff_hevc_idct_4x4_10_avx;
942            c->idct[1] = ff_hevc_idct_8x8_10_avx;
943
944            SAO_BAND_INIT(10, avx);
945        }
946        if (EXTERNAL_AVX2(cpu_flags)) {
947            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
948        }
949        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
950            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
951            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
952            if (ARCH_X86_64) {
953                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
954                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
955                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
956                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
957                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
958
959                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
960                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
961                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
962                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
963                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
964
965                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
966                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
967                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
968                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
969                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
970
971                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
972                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
973                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
974                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
975                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
976
977                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
978                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
979                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
980                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
981                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
982                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
983                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
984                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
985                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
986                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
987
988                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
989                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
990                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
991                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
992                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
993
994                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
995                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
996                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
997                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
998                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
999
1000                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
1001                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
1002                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
1003                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
1004                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
1005
1006                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
1007                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
1008                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
1009                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
1010                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
1011
1012                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
1013                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
1014                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
1015                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
1016                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
1017
1018                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
1019                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
1020                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
1021                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
1022                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
1023
1024                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
1025                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
1026                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
1027                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
1028                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
1029
1030                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
1031                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
1032                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
1033                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
1034                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
1035
1036                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
1037                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
1038                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
1039                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
1040                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
1041
1042                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
1043                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
1044                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
1045                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
1046                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
1047
1048                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
1049                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
1050                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
1051                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
1052                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
1053
1054                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
1055                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
1056                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
1057                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
1058                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
1059
1060                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
1061                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
1062                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
1063                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
1064                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
1065
1066                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
1067                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
1068                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
1069                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
1070                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
1071
1072                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
1073                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
1074                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
1075                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
1076                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
1077
1078                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
1079                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
1080                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
1081                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
1082                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
1083
1084                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
1085                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
1086                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
1087                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
1088                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
1089
1090                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
1091                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
1092                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
1093                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
1094                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
1095            }
1096            SAO_BAND_INIT(10, avx2);
1097            SAO_EDGE_INIT(10, avx2);
1098
1099            c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
1100            c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
1101        }
1102    } else if (bit_depth == 12) {
1103        if (EXTERNAL_MMXEXT(cpu_flags)) {
1104            c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
1105        }
1106        if (EXTERNAL_SSE2(cpu_flags)) {
1107            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
1108            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
1109            if (ARCH_X86_64) {
1110                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
1111                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
1112            }
1113            SAO_BAND_INIT(12, sse2);
1114            SAO_EDGE_INIT(12, sse2);
1115
1116            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
1117            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
1118            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
1119        }
1120        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
1121            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
1122            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
1123        }
1124        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
1125            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
1126            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
1127            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
1128            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
1129
1130            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
1131            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
1132            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
1133            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
1134        }
1135        if (EXTERNAL_AVX(cpu_flags)) {
1136            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
1137            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
1138            if (ARCH_X86_64) {
1139                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
1140                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
1141            }
1142            SAO_BAND_INIT(12, avx);
1143        }
1144        if (EXTERNAL_AVX2(cpu_flags)) {
1145            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
1146        }
1147        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
1148            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
1149            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
1150
1151            SAO_BAND_INIT(12, avx2);
1152            SAO_EDGE_INIT(12, avx2);
1153        }
1154    }
1155}
1156