/*
 * VP9 SIMD optimizations
 *
 * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include "libavutil/attributes.h"
24cabdff1aSopenharmony_ci#include "libavutil/cpu.h"
25cabdff1aSopenharmony_ci#include "libavutil/x86/cpu.h"
26cabdff1aSopenharmony_ci#include "libavcodec/vp9dsp.h"
27cabdff1aSopenharmony_ci#include "libavcodec/x86/vp9dsp_init.h"
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_ci#if HAVE_X86ASM
30cabdff1aSopenharmony_ci
31cabdff1aSopenharmony_cidecl_fpel_func(put,  4,   , mmx);
32cabdff1aSopenharmony_cidecl_fpel_func(put,  8,   , mmx);
33cabdff1aSopenharmony_cidecl_fpel_func(put, 16,   , sse);
34cabdff1aSopenharmony_cidecl_fpel_func(put, 32,   , sse);
35cabdff1aSopenharmony_cidecl_fpel_func(put, 64,   , sse);
36cabdff1aSopenharmony_cidecl_fpel_func(avg,  4, _8, mmxext);
37cabdff1aSopenharmony_cidecl_fpel_func(avg,  8, _8, mmxext);
38cabdff1aSopenharmony_cidecl_fpel_func(avg, 16, _8, sse2);
39cabdff1aSopenharmony_cidecl_fpel_func(avg, 32, _8, sse2);
40cabdff1aSopenharmony_cidecl_fpel_func(avg, 64, _8, sse2);
41cabdff1aSopenharmony_cidecl_fpel_func(put, 32,   , avx);
42cabdff1aSopenharmony_cidecl_fpel_func(put, 64,   , avx);
43cabdff1aSopenharmony_cidecl_fpel_func(avg, 32, _8, avx2);
44cabdff1aSopenharmony_cidecl_fpel_func(avg, 64, _8, avx2);
45cabdff1aSopenharmony_ci
46cabdff1aSopenharmony_cidecl_mc_funcs(4, mmxext, int16_t, 8, 8);
47cabdff1aSopenharmony_cidecl_mc_funcs(8, sse2, int16_t,  8, 8);
48cabdff1aSopenharmony_cidecl_mc_funcs(4, ssse3, int8_t, 32, 8);
49cabdff1aSopenharmony_cidecl_mc_funcs(8, ssse3, int8_t, 32, 8);
50cabdff1aSopenharmony_ci#if ARCH_X86_64
51cabdff1aSopenharmony_cidecl_mc_funcs(16, ssse3, int8_t, 32, 8);
52cabdff1aSopenharmony_cidecl_mc_funcs(32, avx2, int8_t, 32, 8);
53cabdff1aSopenharmony_ci#endif
54cabdff1aSopenharmony_ci
55cabdff1aSopenharmony_cimc_rep_funcs(16,  8,  8,  sse2, int16_t,  8, 8)
56cabdff1aSopenharmony_ci#if ARCH_X86_32
57cabdff1aSopenharmony_cimc_rep_funcs(16,  8,  8, ssse3, int8_t,  32, 8)
58cabdff1aSopenharmony_ci#endif
59cabdff1aSopenharmony_cimc_rep_funcs(32, 16, 16, sse2,  int16_t,  8, 8)
60cabdff1aSopenharmony_cimc_rep_funcs(32, 16, 16, ssse3, int8_t,  32, 8)
61cabdff1aSopenharmony_cimc_rep_funcs(64, 32, 32, sse2,  int16_t,  8, 8)
62cabdff1aSopenharmony_cimc_rep_funcs(64, 32, 32, ssse3, int8_t,  32, 8)
63cabdff1aSopenharmony_ci#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
64cabdff1aSopenharmony_cimc_rep_funcs(64, 32, 32, avx2,  int8_t,  32, 8)
65cabdff1aSopenharmony_ci#endif
66cabdff1aSopenharmony_ci
67cabdff1aSopenharmony_ciextern const int8_t ff_filters_ssse3[3][15][4][32];
68cabdff1aSopenharmony_ciextern const int16_t ff_filters_sse2[3][15][8][8];
69cabdff1aSopenharmony_ci
70cabdff1aSopenharmony_cifilters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
71cabdff1aSopenharmony_cifilters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
72cabdff1aSopenharmony_cifilters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
73cabdff1aSopenharmony_cifilters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
74cabdff1aSopenharmony_ci#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
75cabdff1aSopenharmony_cifilters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
76cabdff1aSopenharmony_cifilters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
77cabdff1aSopenharmony_cifilters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
78cabdff1aSopenharmony_cifilters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
79cabdff1aSopenharmony_ci#endif
80cabdff1aSopenharmony_ci
81cabdff1aSopenharmony_cifilters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
82cabdff1aSopenharmony_cifilters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
83cabdff1aSopenharmony_cifilters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
84cabdff1aSopenharmony_cifilters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
85cabdff1aSopenharmony_ci#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
86cabdff1aSopenharmony_cifilters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
87cabdff1aSopenharmony_cifilters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
88cabdff1aSopenharmony_cifilters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
89cabdff1aSopenharmony_cifilters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
90cabdff1aSopenharmony_ci#endif
91cabdff1aSopenharmony_ci
/*
 * Prototypes for the inverse-transform-and-add kernels.
 *
 * itxfm_func(typea, typeb, size, opt) declares
 *     void ff_vp9_<typea>_<typeb>_<size>x<size>_add_<opt>(uint8_t *dst,
 *                                                         ptrdiff_t stride,
 *                                                         int16_t *block,
 *                                                         int eob);
 * itxfm_funcs(size, opt) declares all four idct/iadst combinations for one
 * block size and instruction set.
 *
 * Both helper macros are local to this section and are #undef'd at its end.
 */
#define itxfm_func(typea, typeb, size, opt) \
void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                            int16_t *block, int eob)
#define itxfm_funcs(size, opt) \
itxfm_func(idct,  idct,  size, opt); \
itxfm_func(iadst, idct,  size, opt); \
itxfm_func(idct,  iadst, size, opt); \
itxfm_func(iadst, iadst, size, opt)

/* 4x4: idct/idct has an mmxext version, the three adst mixes start at sse2. */
itxfm_func(idct,  idct,  4, mmxext);
itxfm_func(idct,  iadst, 4, sse2);
itxfm_func(iadst, idct,  4, sse2);
itxfm_func(iadst, iadst, 4, sse2);
itxfm_funcs(4, ssse3);
itxfm_funcs(8, sse2);
itxfm_funcs(8, ssse3);
itxfm_funcs(8, avx);
itxfm_funcs(16, sse2);
itxfm_funcs(16, ssse3);
itxfm_funcs(16, avx);
/* 32x32: only the idct/idct combination is declared. */
itxfm_func(idct, idct, 32, sse2);
itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
/* 4x4 iwht; the init function stores it in the slot it labels "lossless". */
itxfm_func(iwht, iwht, 4, mmx);
itxfm_funcs(16, avx2);
itxfm_func(idct, idct, 32, avx2);

#undef itxfm_func
#undef itxfm_funcs
121cabdff1aSopenharmony_ci
/*
 * Prototypes for the loop-filter kernels.
 *
 * lpf_funcs(size1, size2, opt) declares a vertical and a horizontal variant:
 *     void ff_vp9_loop_filter_v_<size1>_<size2>_<opt>(uint8_t *dst,
 *                                                     ptrdiff_t stride,
 *                                                     int E, int I, int H);
 *     void ff_vp9_loop_filter_h_<size1>_<size2>_<opt>(... same arguments ...);
 *
 * The macro is local to this section and is #undef'd at its end.
 */
#define lpf_funcs(size1, size2, opt) \
void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                    int E, int I, int H); \
void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                    int E, int I, int H)

/* The *_8 kernels; the init function stores them in dsp->loop_filter_8. */
lpf_funcs(4, 8, mmxext);
lpf_funcs(8, 8, mmxext);
/* The 16_16 kernels; stored in dsp->loop_filter_16. */
lpf_funcs(16, 16, sse2);
lpf_funcs(16, 16, ssse3);
lpf_funcs(16, 16, avx);
/* The 44/84/48/88 kernels; stored in dsp->loop_filter_mix2. */
lpf_funcs(44, 16, sse2);
lpf_funcs(44, 16, ssse3);
lpf_funcs(44, 16, avx);
lpf_funcs(84, 16, sse2);
lpf_funcs(84, 16, ssse3);
lpf_funcs(84, 16, avx);
lpf_funcs(48, 16, sse2);
lpf_funcs(48, 16, ssse3);
lpf_funcs(48, 16, avx);
lpf_funcs(88, 16, sse2);
lpf_funcs(88, 16, ssse3);
lpf_funcs(88, 16, avx);

#undef lpf_funcs
147cabdff1aSopenharmony_ci
/*
 * Prototypes for the intra-prediction kernels.
 *
 * ipred_func(size, type, opt) declares
 *     void ff_vp9_ipred_<type>_<size>x<size>_<opt>(uint8_t *dst,
 *                                                  ptrdiff_t stride,
 *                                                  const uint8_t *l,
 *                                                  const uint8_t *a);
 * The grouping macros below bundle several predictor types for one block size
 * and instruction set. All helper macros are local to this section and are
 * #undef'd at its end.
 */
#define ipred_func(size, type, opt) \
void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                   const uint8_t *l, const uint8_t *a)

ipred_func(8, v, mmx);

/* dc, dc_left and dc_top predictors. */
#define ipred_dc_funcs(size, opt) \
ipred_func(size, dc, opt); \
ipred_func(size, dc_left, opt); \
ipred_func(size, dc_top, opt)

ipred_dc_funcs(4, mmxext);
ipred_dc_funcs(8, mmxext);

/* tm plus the six directional predictors (dl, dr, hd, hu, vl, vr). */
#define ipred_dir_tm_funcs(size, opt) \
ipred_func(size, tm, opt); \
ipred_func(size, dl, opt); \
ipred_func(size, dr, opt); \
ipred_func(size, hd, opt); \
ipred_func(size, hu, opt); \
ipred_func(size, vl, opt); \
ipred_func(size, vr, opt)

ipred_dir_tm_funcs(4, mmxext);

ipred_func(16, v, sse);
ipred_func(32, v, sse);

ipred_dc_funcs(16, sse2);
ipred_dc_funcs(32, sse2);

/* Same as ipred_dir_tm_funcs, plus the h predictor. */
#define ipred_dir_tm_h_funcs(size, opt) \
ipred_dir_tm_funcs(size, opt); \
ipred_func(size, h, opt)

ipred_dir_tm_h_funcs(8, sse2);
ipred_dir_tm_h_funcs(16, sse2);
ipred_dir_tm_h_funcs(32, sse2);

ipred_func(4, h, sse2);

/* Every predictor type handled above except v. */
#define ipred_all_funcs(size, opt) \
ipred_dc_funcs(size, opt); \
ipred_dir_tm_h_funcs(size, opt)

/*
 * FIXME hd/vl_4x4_ssse3 does not exist. The 4x4 line below still declares
 * prototypes for them; ff_vp9dsp_init_x86() #defines both names to the mmxext
 * functions before it uses them.
 */
ipred_all_funcs(4, ssse3);
ipred_all_funcs(8, ssse3);
ipred_all_funcs(16, ssse3);
ipred_all_funcs(32, ssse3);

ipred_dir_tm_h_funcs(8, avx);
ipred_dir_tm_h_funcs(16, avx);
ipred_dir_tm_h_funcs(32, avx);

ipred_func(32, v, avx);

ipred_dc_funcs(32, avx2);
ipred_func(32, h, avx2);
ipred_func(32, tm, avx2);

/* Drop every helper macro defined in this section, ipred_all_funcs included. */
#undef ipred_func
#undef ipred_all_funcs
#undef ipred_dir_tm_h_funcs
#undef ipred_dir_tm_funcs
#undef ipred_dc_funcs
213cabdff1aSopenharmony_ci
214cabdff1aSopenharmony_ci#endif /* HAVE_X86ASM */
215cabdff1aSopenharmony_ci
216cabdff1aSopenharmony_ciav_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
217cabdff1aSopenharmony_ci{
218cabdff1aSopenharmony_ci#if HAVE_X86ASM
219cabdff1aSopenharmony_ci    int cpu_flags;
220cabdff1aSopenharmony_ci
221cabdff1aSopenharmony_ci    if (bpp == 10) {
222cabdff1aSopenharmony_ci        ff_vp9dsp_init_10bpp_x86(dsp, bitexact);
223cabdff1aSopenharmony_ci        return;
224cabdff1aSopenharmony_ci    } else if (bpp == 12) {
225cabdff1aSopenharmony_ci        ff_vp9dsp_init_12bpp_x86(dsp, bitexact);
226cabdff1aSopenharmony_ci        return;
227cabdff1aSopenharmony_ci    }
228cabdff1aSopenharmony_ci
229cabdff1aSopenharmony_ci    cpu_flags = av_get_cpu_flags();
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_ci#define init_lpf(opt) do { \
232cabdff1aSopenharmony_ci    dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
233cabdff1aSopenharmony_ci    dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
234cabdff1aSopenharmony_ci    dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
235cabdff1aSopenharmony_ci    dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
236cabdff1aSopenharmony_ci    dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
237cabdff1aSopenharmony_ci    dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
238cabdff1aSopenharmony_ci    dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
239cabdff1aSopenharmony_ci    dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
240cabdff1aSopenharmony_ci    dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
241cabdff1aSopenharmony_ci    dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
242cabdff1aSopenharmony_ci} while (0)
243cabdff1aSopenharmony_ci
244cabdff1aSopenharmony_ci#define init_ipred(sz, opt, t, e) \
245cabdff1aSopenharmony_ci    dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
246cabdff1aSopenharmony_ci
247cabdff1aSopenharmony_ci#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
248cabdff1aSopenharmony_ci#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
249cabdff1aSopenharmony_ci#define init_dir_tm_ipred(sz, opt) do { \
250cabdff1aSopenharmony_ci    init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
251cabdff1aSopenharmony_ci    init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
252cabdff1aSopenharmony_ci    init_ipred(sz, opt, hd, HOR_DOWN); \
253cabdff1aSopenharmony_ci    init_ipred(sz, opt, vl, VERT_LEFT); \
254cabdff1aSopenharmony_ci    init_ipred(sz, opt, hu, HOR_UP); \
255cabdff1aSopenharmony_ci    init_ipred(sz, opt, tm, TM_VP8); \
256cabdff1aSopenharmony_ci    init_ipred(sz, opt, vr, VERT_RIGHT); \
257cabdff1aSopenharmony_ci} while (0)
258cabdff1aSopenharmony_ci#define init_dir_tm_h_ipred(sz, opt) do { \
259cabdff1aSopenharmony_ci    init_dir_tm_ipred(sz, opt); \
260cabdff1aSopenharmony_ci    init_ipred(sz, opt, h,  HOR); \
261cabdff1aSopenharmony_ci} while (0)
262cabdff1aSopenharmony_ci#define init_dc_ipred(sz, opt) do { \
263cabdff1aSopenharmony_ci    init_ipred(sz, opt, dc,      DC); \
264cabdff1aSopenharmony_ci    init_ipred(sz, opt, dc_left, LEFT_DC); \
265cabdff1aSopenharmony_ci    init_ipred(sz, opt, dc_top,  TOP_DC); \
266cabdff1aSopenharmony_ci} while (0)
267cabdff1aSopenharmony_ci#define init_all_ipred(sz, opt) do { \
268cabdff1aSopenharmony_ci    init_dc_ipred(sz, opt); \
269cabdff1aSopenharmony_ci    init_dir_tm_h_ipred(sz, opt); \
270cabdff1aSopenharmony_ci} while (0)
271cabdff1aSopenharmony_ci
272cabdff1aSopenharmony_ci    if (EXTERNAL_MMX(cpu_flags)) {
273cabdff1aSopenharmony_ci        init_fpel_func(4, 0,  4, put, , mmx);
274cabdff1aSopenharmony_ci        init_fpel_func(3, 0,  8, put, , mmx);
275cabdff1aSopenharmony_ci        if (!bitexact) {
276cabdff1aSopenharmony_ci            dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
277cabdff1aSopenharmony_ci            dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
278cabdff1aSopenharmony_ci            dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
279cabdff1aSopenharmony_ci            dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
280cabdff1aSopenharmony_ci        }
281cabdff1aSopenharmony_ci        init_ipred(8, mmx, v, VERT);
282cabdff1aSopenharmony_ci    }
283cabdff1aSopenharmony_ci
284cabdff1aSopenharmony_ci    if (EXTERNAL_MMXEXT(cpu_flags)) {
285cabdff1aSopenharmony_ci        dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext;
286cabdff1aSopenharmony_ci        dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext;
287cabdff1aSopenharmony_ci        dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext;
288cabdff1aSopenharmony_ci        dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext;
289cabdff1aSopenharmony_ci        init_subpel2(4, 0, 4, put, 8, mmxext);
290cabdff1aSopenharmony_ci        init_subpel2(4, 1, 4, avg, 8, mmxext);
291cabdff1aSopenharmony_ci        init_fpel_func(4, 1,  4, avg, _8, mmxext);
292cabdff1aSopenharmony_ci        init_fpel_func(3, 1,  8, avg, _8, mmxext);
293cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
294cabdff1aSopenharmony_ci        init_dc_ipred(4, mmxext);
295cabdff1aSopenharmony_ci        init_dc_ipred(8, mmxext);
296cabdff1aSopenharmony_ci        init_dir_tm_ipred(4, mmxext);
297cabdff1aSopenharmony_ci    }
298cabdff1aSopenharmony_ci
299cabdff1aSopenharmony_ci    if (EXTERNAL_SSE(cpu_flags)) {
300cabdff1aSopenharmony_ci        init_fpel_func(2, 0, 16, put, , sse);
301cabdff1aSopenharmony_ci        init_fpel_func(1, 0, 32, put, , sse);
302cabdff1aSopenharmony_ci        init_fpel_func(0, 0, 64, put, , sse);
303cabdff1aSopenharmony_ci        init_ipred(16, sse, v, VERT);
304cabdff1aSopenharmony_ci        init_ipred(32, sse, v, VERT);
305cabdff1aSopenharmony_ci    }
306cabdff1aSopenharmony_ci
307cabdff1aSopenharmony_ci    if (EXTERNAL_SSE2(cpu_flags)) {
308cabdff1aSopenharmony_ci        init_subpel3_8to64(0, put, 8, sse2);
309cabdff1aSopenharmony_ci        init_subpel3_8to64(1, avg, 8, sse2);
310cabdff1aSopenharmony_ci        init_fpel_func(2, 1, 16, avg,  _8, sse2);
311cabdff1aSopenharmony_ci        init_fpel_func(1, 1, 32, avg,  _8, sse2);
312cabdff1aSopenharmony_ci        init_fpel_func(0, 1, 64, avg,  _8, sse2);
313cabdff1aSopenharmony_ci        init_lpf(sse2);
314cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_sse2;
315cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_sse2;
316cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2;
317cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2;
318cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_sse2;
319cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_sse2;
320cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2;
321cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_sse2;
322cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_sse2;
323cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_sse2;
324cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2;
325cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][ADST_ADST] =
326cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][ADST_DCT] =
327cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][DCT_ADST] =
328cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
329cabdff1aSopenharmony_ci        init_dc_ipred(16, sse2);
330cabdff1aSopenharmony_ci        init_dc_ipred(32, sse2);
331cabdff1aSopenharmony_ci        init_dir_tm_h_ipred(8, sse2);
332cabdff1aSopenharmony_ci        init_dir_tm_h_ipred(16, sse2);
333cabdff1aSopenharmony_ci        init_dir_tm_h_ipred(32, sse2);
334cabdff1aSopenharmony_ci        init_ipred(4, sse2, h, HOR);
335cabdff1aSopenharmony_ci    }
336cabdff1aSopenharmony_ci
337cabdff1aSopenharmony_ci    if (EXTERNAL_SSSE3(cpu_flags)) {
338cabdff1aSopenharmony_ci        init_subpel3(0, put, 8, ssse3);
339cabdff1aSopenharmony_ci        init_subpel3(1, avg, 8, ssse3);
340cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
341cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_ssse3;
342cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_ssse3;
343cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
344cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
345cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_ssse3;
346cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_ssse3;
347cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
348cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_ssse3;
349cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_ssse3;
350cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_ssse3;
351cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
352cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][ADST_ADST] =
353cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][ADST_DCT] =
354cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][DCT_ADST] =
355cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
356cabdff1aSopenharmony_ci        init_lpf(ssse3);
357cabdff1aSopenharmony_ci        init_all_ipred(4, ssse3);
358cabdff1aSopenharmony_ci        init_all_ipred(8, ssse3);
359cabdff1aSopenharmony_ci        init_all_ipred(16, ssse3);
360cabdff1aSopenharmony_ci        init_all_ipred(32, ssse3);
361cabdff1aSopenharmony_ci    }
362cabdff1aSopenharmony_ci
363cabdff1aSopenharmony_ci    if (EXTERNAL_AVX(cpu_flags)) {
364cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
365cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_avx;
366cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_avx;
367cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
368cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
369cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx;
370cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx;
371cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
372cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][ADST_ADST] =
373cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][ADST_DCT] =
374cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][DCT_ADST] =
375cabdff1aSopenharmony_ci        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
376cabdff1aSopenharmony_ci        init_lpf(avx);
377cabdff1aSopenharmony_ci        init_dir_tm_h_ipred(8, avx);
378cabdff1aSopenharmony_ci        init_dir_tm_h_ipred(16, avx);
379cabdff1aSopenharmony_ci        init_dir_tm_h_ipred(32, avx);
380cabdff1aSopenharmony_ci    }
381cabdff1aSopenharmony_ci    if (EXTERNAL_AVX_FAST(cpu_flags)) {
382cabdff1aSopenharmony_ci        init_fpel_func(1, 0, 32, put, , avx);
383cabdff1aSopenharmony_ci        init_fpel_func(0, 0, 64, put, , avx);
384cabdff1aSopenharmony_ci        init_ipred(32, avx, v, VERT);
385cabdff1aSopenharmony_ci    }
386cabdff1aSopenharmony_ci
387cabdff1aSopenharmony_ci    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
388cabdff1aSopenharmony_ci        init_fpel_func(1, 1, 32, avg, _8, avx2);
389cabdff1aSopenharmony_ci        init_fpel_func(0, 1, 64, avg, _8, avx2);
390cabdff1aSopenharmony_ci        if (ARCH_X86_64) {
391cabdff1aSopenharmony_ci#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
392cabdff1aSopenharmony_ci            dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2;
393cabdff1aSopenharmony_ci            dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx2;
394cabdff1aSopenharmony_ci            dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx2;
395cabdff1aSopenharmony_ci            dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2;
396cabdff1aSopenharmony_ci            dsp->itxfm_add[TX_32X32][ADST_ADST] =
397cabdff1aSopenharmony_ci            dsp->itxfm_add[TX_32X32][ADST_DCT] =
398cabdff1aSopenharmony_ci            dsp->itxfm_add[TX_32X32][DCT_ADST] =
399cabdff1aSopenharmony_ci            dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2;
400cabdff1aSopenharmony_ci            init_subpel3_32_64(0, put, 8, avx2);
401cabdff1aSopenharmony_ci            init_subpel3_32_64(1, avg, 8, avx2);
402cabdff1aSopenharmony_ci#endif
403cabdff1aSopenharmony_ci        }
404cabdff1aSopenharmony_ci        init_dc_ipred(32, avx2);
405cabdff1aSopenharmony_ci        init_ipred(32, avx2, h,  HOR);
406cabdff1aSopenharmony_ci        init_ipred(32, avx2, tm, TM_VP8);
407cabdff1aSopenharmony_ci    }
408cabdff1aSopenharmony_ci
409cabdff1aSopenharmony_ci#undef init_fpel
410cabdff1aSopenharmony_ci#undef init_subpel1
411cabdff1aSopenharmony_ci#undef init_subpel2
412cabdff1aSopenharmony_ci#undef init_subpel3
413cabdff1aSopenharmony_ci
414cabdff1aSopenharmony_ci#endif /* HAVE_X86ASM */
415cabdff1aSopenharmony_ci}
416