/*
 * x86 optimized discrete wavelet transform
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include "libavutil/x86/asm.h"
24cabdff1aSopenharmony_ci#include "libavutil/x86/cpu.h"
25cabdff1aSopenharmony_ci#include "libavcodec/dirac_dwt.h"
26cabdff1aSopenharmony_ci
/**
 * Generate the C entry points that wrap the external DWT composition
 * kernels for one instruction-set flavour.
 *
 * @param ext   suffix pasted onto every generated and wrapped symbol
 *              (for example _sse2)
 * @param align number of int16_t coefficients per kernel step; the bit masks
 *              used below only yield a multiple of align when it is a power
 *              of two
 *
 * Every vertical_compose*##ext wrapper finishes the trailing
 * (width % align) coefficients with the scalar COMPOSE_* macros and hands
 * only the aligned prefix [0, width_align) to the matching ff_* kernel.
 * The kernel call is skipped when that prefix is empty.
 * NOTE(review): the upstream kernels appear to be decrement-then-test loops
 * that would still process one vector located before the row start when
 * given a width of 0 -- confirm against dirac_dwt.asm.
 *
 * The horizontal_compose_haar{0,1}i##ext wrappers call the ff_* kernel with
 * the full width first and then produce the last (w/2) % align output pairs
 * in C. tmp[] is only read after the kernel call, so the kernel is
 * presumably what fills it -- TODO confirm; for that reason the call is
 * never skipped there.
 */
#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w); \
void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w); \
\
static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
    int i, width_align = width & ~((align) - 1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
\
    /* scalar tail: coefficients past the last full vector */ \
    for (i = width_align; i < width; i++) \
        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
\
    /* nothing for the kernel to do when no full vector fits */ \
    if (width_align > 0) \
        ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
    int i, width_align = width & ~((align) - 1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
\
    for (i = width_align; i < width; i++) \
        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
\
    if (width_align > 0) \
        ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
                                           uint8_t *_b3, uint8_t *_b4, int width) \
{ \
    int i, width_align = width & ~((align) - 1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
    int16_t *b3 = (int16_t *)_b3; \
    int16_t *b4 = (int16_t *)_b4; \
\
    /* five-row filters update the middle row b2 */ \
    for (i = width_align; i < width; i++) \
        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    if (width_align > 0) \
        ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
                                          uint8_t *_b3, uint8_t *_b4, int width) \
{ \
    int i, width_align = width & ~((align) - 1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
    int16_t *b3 = (int16_t *)_b3; \
    int16_t *b4 = (int16_t *)_b4; \
\
    for (i = width_align; i < width; i++) \
        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    if (width_align > 0) \
        ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
{ \
    int i, width_align = width & ~((align) - 1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
\
    for (i = width_align; i < width; i++) { \
        /* order matters: the second step consumes the updated b0[i] */ \
        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
    } \
\
    if (width_align > 0) \
        ff_vertical_compose_haar##ext(b0, b1, width_align); \
} \
\
static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w) \
{ \
    int w2 = w >> 1; \
    int x  = w2 - (w2 & ((align) - 1)); \
    int16_t *b   = (int16_t *)_b; \
    int16_t *tmp = (int16_t *)_tmp; \
\
    /* always called: tmp[] is consumed by the scalar tail below */ \
    ff_horizontal_compose_haar0i##ext(b, tmp, w); \
\
    for (; x < w2; x++) { \
        b[2*x  ] = tmp[x]; \
        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]); \
    } \
} \
\
static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w) \
{ \
    int w2 = w >> 1; \
    int x  = w2 - (w2 & ((align) - 1)); \
    int16_t *b   = (int16_t *)_b; \
    int16_t *tmp = (int16_t *)_tmp; \
\
    /* always called: tmp[] is consumed by the scalar tail below */ \
    ff_horizontal_compose_haar1i##ext(b, tmp, w); \
\
    /* same interleave as haar0i, with a rounding shift on each output */ \
    for (; x < w2; x++) { \
        b[2*x  ] = (tmp[x] + 1) >> 1; \
        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1) >> 1; \
    } \
}

#if HAVE_X86ASM
/*
 * Instantiate the _sse2 wrappers with a granularity of 8 coefficients
 * (presumably one 128-bit register of int16_t -- TODO confirm).
 */
COMPOSE_VERTICAL(_sse2, 8)

/* Defined outside this file; only the prototype is visible here. */
void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);

/**
 * Horizontal DD 9,7 composition: external SSSE3 routine plus a C tail.
 *
 * The external routine is called with the full width; afterwards the last
 * (w/2) % 8 output pairs are written here. The tail reads tmp[x-1] through
 * tmp[x+2], so tmp is expected to hold valid neighbours around those
 * positions after the call.
 * NOTE(review): presumably the external routine fills and edge-extends tmp
 * -- confirm.
 *
 * @param _b   coefficient row, accessed as int16_t; even/odd outputs are
 *             interleaved into it
 * @param _tmp scratch row, accessed as int16_t
 * @param w    row width in coefficients
 */
static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
{
    int16_t *const b   = (int16_t *)_b;
    int16_t *const tmp = (int16_t *)_tmp;
    const int half = w >> 1;
    int x;

    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);

    /* start at the last multiple of 8 that does not exceed half */
    for (x = half & ~7; x < half; x++) {
        b[2 * x]     = (tmp[x] + 1) >> 1;
        b[2 * x + 1] = (COMPOSE_DD97iH0(tmp[x - 1], tmp[x], b[x + half],
                                        tmp[x + 1], tmp[x + 2]) + 1) >> 1;
    }
}
#endif
157cabdff1aSopenharmony_ci
158cabdff1aSopenharmony_civoid ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
159cabdff1aSopenharmony_ci{
160cabdff1aSopenharmony_ci#if HAVE_X86ASM
161cabdff1aSopenharmony_ci  int mm_flags = av_get_cpu_flags();
162cabdff1aSopenharmony_ci
163cabdff1aSopenharmony_ci    if (!(mm_flags & AV_CPU_FLAG_SSE2))
164cabdff1aSopenharmony_ci        return;
165cabdff1aSopenharmony_ci
166cabdff1aSopenharmony_ci    switch (type) {
167cabdff1aSopenharmony_ci    case DWT_DIRAC_DD9_7:
168cabdff1aSopenharmony_ci        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
169cabdff1aSopenharmony_ci        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
170cabdff1aSopenharmony_ci        break;
171cabdff1aSopenharmony_ci    case DWT_DIRAC_LEGALL5_3:
172cabdff1aSopenharmony_ci        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
173cabdff1aSopenharmony_ci        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
174cabdff1aSopenharmony_ci        break;
175cabdff1aSopenharmony_ci    case DWT_DIRAC_DD13_7:
176cabdff1aSopenharmony_ci        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
177cabdff1aSopenharmony_ci        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
178cabdff1aSopenharmony_ci        break;
179cabdff1aSopenharmony_ci    case DWT_DIRAC_HAAR0:
180cabdff1aSopenharmony_ci        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
181cabdff1aSopenharmony_ci        d->horizontal_compose = horizontal_compose_haar0i_sse2;
182cabdff1aSopenharmony_ci        break;
183cabdff1aSopenharmony_ci    case DWT_DIRAC_HAAR1:
184cabdff1aSopenharmony_ci        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
185cabdff1aSopenharmony_ci        d->horizontal_compose = horizontal_compose_haar1i_sse2;
186cabdff1aSopenharmony_ci        break;
187cabdff1aSopenharmony_ci    }
188cabdff1aSopenharmony_ci
189cabdff1aSopenharmony_ci    if (!(mm_flags & AV_CPU_FLAG_SSSE3))
190cabdff1aSopenharmony_ci        return;
191cabdff1aSopenharmony_ci
192cabdff1aSopenharmony_ci    switch (type) {
193cabdff1aSopenharmony_ci    case DWT_DIRAC_DD9_7:
194cabdff1aSopenharmony_ci        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
195cabdff1aSopenharmony_ci        break;
196cabdff1aSopenharmony_ci    }
197cabdff1aSopenharmony_ci#endif // HAVE_X86ASM
198cabdff1aSopenharmony_ci}
199