/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/mem_internal.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vp9dsp.h"
#include "vp9dsp_init.h"

/*
 * Declare the prototype of the routine ff_vp9_<type><sz>_neon, e.g.
 * declare_fpel(avg, 16) declares ff_vp9_avg16_neon.
 *
 * The expansion carries no trailing semicolon; the user of the macro
 * supplies it.
 */
#define declare_fpel(type, sz)                                          \
void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
                              const uint8_t *src, ptrdiff_t src_stride, \
                              int h, int mx, int my)

/* Declare both the copy and the avg routine for one block size. */
#define declare_copy_avg(sz) \
    declare_fpel(copy, sz);  \
    declare_fpel(avg , sz)

/*
 * Declare the prototype of ff_vp9_<op>_<filter><sz>_<dir>_neon, e.g.
 * decl_mc_func(put, sharp, h, 8) declares ff_vp9_put_sharp8_h_neon.
 *
 * The expansion carries no trailing semicolon; the user of the macro
 * supplies it.
 */
#define decl_mc_func(op, filter, dir, sz)                                                \
void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
                                               const uint8_t *src, ptrdiff_t src_stride, \
                                               int h, int mx, int my)

/*
 * Define the static function <op>_<filter><sz>_hv_neon, which performs the
 * two-dimensional filter as two passes through a temporary buffer:
 *
 *  1. ff_vp9_put_<filter><sz>_h_neon filters h + 8 source rows, starting
 *     three rows above src, into temp (row stride sz), using mx.
 *  2. ff_vp9_<op>_<filter><sz>_v_neon filters from temp, starting three rows
 *     in, into dst, using my.
 *
 * temp is sized for (2 * sz + 8) rows of sz bytes when sz < 64 and for
 * (sz + 8) rows when sz == 64.
 * NOTE(review): this presumably reflects the largest h callers pass for each
 * width -- confirm against the callers.
 */
#define define_8tap_2d_fn(op, filter, sz)                                         \
static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
                                        const uint8_t *src, ptrdiff_t src_stride, \
                                        int h, int mx, int my)                    \
{                                                                                 \
    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]);           \
    /* We only need h + 7 lines, but the horizontal filter assumes an             \
     * even number of rows, so filter h + 8 lines here. */                        \
    ff_vp9_put_##filter##sz##_h_neon(temp, sz,                                    \
                                     src - 3 * src_stride, src_stride,            \
                                     h + 8, mx, 0);                               \
    ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride,                          \
                                        temp + 3 * sz, sz,                        \
                                        h, 0, my);                                \
}

/* Declare the regular, sharp and smooth variants of one (op, dir, sz) routine. */
#define decl_filter_funcs(op, dir, sz)  \
    decl_mc_func(op, regular, dir, sz); \
    decl_mc_func(op, sharp,   dir, sz); \
    decl_mc_func(op, smooth,  dir, sz)

/*
 * Declare every put/avg x h/v/hv x regular/sharp/smooth prototype for one
 * block size (18 declarations per size).
 */
#define decl_mc_funcs(sz)           \
    decl_filter_funcs(put, h,  sz); \
    decl_filter_funcs(avg, h,  sz); \
    decl_filter_funcs(put, v,  sz); \
    decl_filter_funcs(avg, v,  sz); \
    decl_filter_funcs(put, hv, sz); \
    decl_filter_funcs(avg, hv, sz)

73cabdff1aSopenharmony_ci#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
74cabdff1aSopenharmony_ci#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
75cabdff1aSopenharmony_ci
76cabdff1aSopenharmony_cideclare_copy_avg(64);
77cabdff1aSopenharmony_cideclare_copy_avg(32);
78cabdff1aSopenharmony_cideclare_copy_avg(16);
79cabdff1aSopenharmony_cideclare_copy_avg(8);
80cabdff1aSopenharmony_cideclare_copy_avg(4);
81cabdff1aSopenharmony_ci
82cabdff1aSopenharmony_cidecl_mc_funcs(64);
83cabdff1aSopenharmony_cidecl_mc_funcs(32);
84cabdff1aSopenharmony_cidecl_mc_funcs(16);
85cabdff1aSopenharmony_cidecl_mc_funcs(8);
86cabdff1aSopenharmony_cidecl_mc_funcs(4);
87cabdff1aSopenharmony_ci
88cabdff1aSopenharmony_ci#define define_8tap_2d_funcs(sz)        \
89cabdff1aSopenharmony_ci    define_8tap_2d_fn(put, regular, sz) \
90cabdff1aSopenharmony_ci    define_8tap_2d_fn(put, sharp,   sz) \
91cabdff1aSopenharmony_ci    define_8tap_2d_fn(put, smooth,  sz) \
92cabdff1aSopenharmony_ci    define_8tap_2d_fn(avg, regular, sz) \
93cabdff1aSopenharmony_ci    define_8tap_2d_fn(avg, sharp,   sz) \
94cabdff1aSopenharmony_ci    define_8tap_2d_fn(avg, smooth,  sz)
95cabdff1aSopenharmony_ci
96cabdff1aSopenharmony_cidefine_8tap_2d_funcs(64)
97cabdff1aSopenharmony_cidefine_8tap_2d_funcs(32)
98cabdff1aSopenharmony_cidefine_8tap_2d_funcs(16)
99cabdff1aSopenharmony_cidefine_8tap_2d_funcs(8)
100cabdff1aSopenharmony_cidefine_8tap_2d_funcs(4)
101cabdff1aSopenharmony_ci
102cabdff1aSopenharmony_cistatic av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
103cabdff1aSopenharmony_ci{
104cabdff1aSopenharmony_ci    int cpu_flags = av_get_cpu_flags();
105cabdff1aSopenharmony_ci
106cabdff1aSopenharmony_ci#define init_fpel(idx1, idx2, sz, type, suffix)      \
107cabdff1aSopenharmony_ci    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
108cabdff1aSopenharmony_ci    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
109cabdff1aSopenharmony_ci    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
110cabdff1aSopenharmony_ci    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix
111cabdff1aSopenharmony_ci
112cabdff1aSopenharmony_ci#define init_copy(idx, sz, suffix) \
113cabdff1aSopenharmony_ci    init_fpel(idx, 0, sz, copy, suffix)
114cabdff1aSopenharmony_ci
115cabdff1aSopenharmony_ci#define init_avg(idx, sz, suffix) \
116cabdff1aSopenharmony_ci    init_fpel(idx, 1, sz, avg,  suffix)
117cabdff1aSopenharmony_ci
118cabdff1aSopenharmony_ci#define init_copy_avg(idx, sz) \
119cabdff1aSopenharmony_ci    init_copy(idx, sz, _neon); \
120cabdff1aSopenharmony_ci    init_avg (idx, sz, _neon)
121cabdff1aSopenharmony_ci
122cabdff1aSopenharmony_ci    if (have_armv8(cpu_flags)) {
123cabdff1aSopenharmony_ci        init_copy(0, 64, _aarch64);
124cabdff1aSopenharmony_ci        init_copy(1, 32, _aarch64);
125cabdff1aSopenharmony_ci    }
126cabdff1aSopenharmony_ci
127cabdff1aSopenharmony_ci    if (have_neon(cpu_flags)) {
128cabdff1aSopenharmony_ci#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
129cabdff1aSopenharmony_ci    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
130cabdff1aSopenharmony_ci
131cabdff1aSopenharmony_ci#define init_mc_funcs(idx, dir, mx, my, sz, pfx)                                   \
132cabdff1aSopenharmony_ci    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
133cabdff1aSopenharmony_ci    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx); \
134cabdff1aSopenharmony_ci    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx); \
135cabdff1aSopenharmony_ci    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
136cabdff1aSopenharmony_ci    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx); \
137cabdff1aSopenharmony_ci    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx)
138cabdff1aSopenharmony_ci
139cabdff1aSopenharmony_ci#define init_mc_funcs_dirs(idx, sz)            \
140cabdff1aSopenharmony_ci    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_); \
141cabdff1aSopenharmony_ci    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_); \
142cabdff1aSopenharmony_ci    init_mc_funcs(idx, hv, 1, 1, sz,)
143cabdff1aSopenharmony_ci
144cabdff1aSopenharmony_ci        init_avg(0, 64, _neon);
145cabdff1aSopenharmony_ci        init_avg(1, 32, _neon);
146cabdff1aSopenharmony_ci        init_copy_avg(2, 16);
147cabdff1aSopenharmony_ci        init_copy_avg(3, 8);
148cabdff1aSopenharmony_ci        init_copy_avg(4, 4);
149cabdff1aSopenharmony_ci
150cabdff1aSopenharmony_ci        init_mc_funcs_dirs(0, 64);
151cabdff1aSopenharmony_ci        init_mc_funcs_dirs(1, 32);
152cabdff1aSopenharmony_ci        init_mc_funcs_dirs(2, 16);
153cabdff1aSopenharmony_ci        init_mc_funcs_dirs(3, 8);
154cabdff1aSopenharmony_ci        init_mc_funcs_dirs(4, 4);
155cabdff1aSopenharmony_ci    }
156cabdff1aSopenharmony_ci}
157cabdff1aSopenharmony_ci
/*
 * Declare the prototype of ff_vp9_<type_a>_<type_b>_<sz>x<sz>_add_neon.
 * Despite the "define" in its name, the macro only produces a declaration;
 * the expansion carries no trailing semicolon.
 */
#define define_itxfm(type_a, type_b, sz)                                   \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst,    \
                                                         ptrdiff_t stride, \
                                                         int16_t *_block, int eob)

/* Declare all four idct/iadst combinations for one transform size. */
#define define_itxfm_funcs(sz)      \
    define_itxfm(idct,  idct,  sz); \
    define_itxfm(iadst, idct,  sz); \
    define_itxfm(idct,  iadst, sz); \
    define_itxfm(iadst, iadst, sz)

define_itxfm_funcs(4);
define_itxfm_funcs(8);
define_itxfm_funcs(16);
/* Only one combination each is declared for 32x32 and for iwht 4x4. */
define_itxfm(idct, idct, 32);
define_itxfm(iwht, iwht, 4);

176cabdff1aSopenharmony_cistatic av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
177cabdff1aSopenharmony_ci{
178cabdff1aSopenharmony_ci    int cpu_flags = av_get_cpu_flags();
179cabdff1aSopenharmony_ci
180cabdff1aSopenharmony_ci    if (have_neon(cpu_flags)) {
181cabdff1aSopenharmony_ci#define init_itxfm(tx, sz)                                             \
182cabdff1aSopenharmony_ci    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_neon;  \
183cabdff1aSopenharmony_ci    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_neon; \
184cabdff1aSopenharmony_ci    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_neon; \
185cabdff1aSopenharmony_ci    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
186cabdff1aSopenharmony_ci
187cabdff1aSopenharmony_ci#define init_idct(tx, nm)           \
188cabdff1aSopenharmony_ci    dsp->itxfm_add[tx][DCT_DCT]   = \
189cabdff1aSopenharmony_ci    dsp->itxfm_add[tx][ADST_DCT]  = \
190cabdff1aSopenharmony_ci    dsp->itxfm_add[tx][DCT_ADST]  = \
191cabdff1aSopenharmony_ci    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
192cabdff1aSopenharmony_ci
193cabdff1aSopenharmony_ci        init_itxfm(TX_4X4, 4x4);
194cabdff1aSopenharmony_ci        init_itxfm(TX_8X8, 8x8);
195cabdff1aSopenharmony_ci        init_itxfm(TX_16X16, 16x16);
196cabdff1aSopenharmony_ci        init_idct(TX_32X32, idct_idct_32x32);
197cabdff1aSopenharmony_ci        init_idct(4, iwht_iwht_4x4);
198cabdff1aSopenharmony_ci    }
199cabdff1aSopenharmony_ci}
200cabdff1aSopenharmony_ci
/*
 * Declare the prototype of ff_vp9_loop_filter_<dir>_<wd>_<len>_neon.
 * Despite the "define" in its name, the macro only produces a declaration;
 * the expansion carries no trailing semicolon.
 */
#define define_loop_filter(dir, wd, len) \
void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)

/* Declare the h and the v variant of one (wd, len) loop filter. */
#define define_loop_filters(wd, len) \
    define_loop_filter(h, wd, len);  \
    define_loop_filter(v, wd, len)

define_loop_filters(4, 8);
define_loop_filters(8, 8);
define_loop_filters(16, 8);

define_loop_filters(16, 16);

define_loop_filters(44, 16);
define_loop_filters(48, 16);
define_loop_filters(84, 16);
define_loop_filters(88, 16);

219cabdff1aSopenharmony_cistatic av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
220cabdff1aSopenharmony_ci{
221cabdff1aSopenharmony_ci    int cpu_flags = av_get_cpu_flags();
222cabdff1aSopenharmony_ci
223cabdff1aSopenharmony_ci    if (have_neon(cpu_flags)) {
224cabdff1aSopenharmony_ci        dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
225cabdff1aSopenharmony_ci        dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
226cabdff1aSopenharmony_ci        dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
227cabdff1aSopenharmony_ci        dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
228cabdff1aSopenharmony_ci        dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
229cabdff1aSopenharmony_ci        dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_ci        dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
232cabdff1aSopenharmony_ci        dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
233cabdff1aSopenharmony_ci
234cabdff1aSopenharmony_ci        dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
235cabdff1aSopenharmony_ci        dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
236cabdff1aSopenharmony_ci        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon;
237cabdff1aSopenharmony_ci        dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon;
238cabdff1aSopenharmony_ci        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_neon;
239cabdff1aSopenharmony_ci        dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon;
240cabdff1aSopenharmony_ci        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon;
241cabdff1aSopenharmony_ci        dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon;
242cabdff1aSopenharmony_ci    }
243cabdff1aSopenharmony_ci}
244cabdff1aSopenharmony_ci
245cabdff1aSopenharmony_ciav_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
246cabdff1aSopenharmony_ci{
247cabdff1aSopenharmony_ci    if (bpp == 10) {
248cabdff1aSopenharmony_ci        ff_vp9dsp_init_10bpp_aarch64(dsp);
249cabdff1aSopenharmony_ci        return;
250cabdff1aSopenharmony_ci    } else if (bpp == 12) {
251cabdff1aSopenharmony_ci        ff_vp9dsp_init_12bpp_aarch64(dsp);
252cabdff1aSopenharmony_ci        return;
253cabdff1aSopenharmony_ci    } else if (bpp != 8)
254cabdff1aSopenharmony_ci        return;
255cabdff1aSopenharmony_ci
256cabdff1aSopenharmony_ci    vp9dsp_mc_init_aarch64(dsp);
257cabdff1aSopenharmony_ci    vp9dsp_loopfilter_init_aarch64(dsp);
258cabdff1aSopenharmony_ci    vp9dsp_itxfm_init_aarch64(dsp);
259cabdff1aSopenharmony_ci}
260