1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* x86 optimized discrete wavelet transform
3cabdff1aSopenharmony_ci;* Copyright (c) 2010 David Conrad
4cabdff1aSopenharmony_ci;*
5cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
8cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
9cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
10cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
11cabdff1aSopenharmony_ci;*
12cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
13cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
16cabdff1aSopenharmony_ci;*
17cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
18cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
19cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20cabdff1aSopenharmony_ci;******************************************************************************
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ciSECTION_RODATA
; pw_1991: eight 16-bit words alternating 9, -1 (the pair repeated 4 times).
; It is the pmaddwd multiplier used below: with words interleaved as (a, b),
; each 32-bit lane of the product is 9*a - b.
25cabdff1aSopenharmony_cipw_1991: times 4 dw 9,-1
26cabdff1aSopenharmony_ci
; Word constants defined outside this file. In this file they are only used as
; rounding terms; presumably each is a vector of packed words holding the value
; in its name (1, 2, 8, 16) -- confirm against the defining file.
27cabdff1aSopenharmony_cicextern pw_1
28cabdff1aSopenharmony_cicextern pw_2
29cabdff1aSopenharmony_cicextern pw_8
30cabdff1aSopenharmony_cicextern pw_16
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_ciSECTION .text
33cabdff1aSopenharmony_ci
; COMPOSE_53iL0 dst, acc, src, round
;   Per packed 16-bit word: dst -= (acc + src + round) >> 2, using an
;   arithmetic (sign-preserving) shift.
;   %1 (dst) and %2 (acc) are instruction destinations, so both must be
;   registers; %2 is used as the accumulator and is clobbered.
;   %3 and %4 are only read (%3 is passed as a memory operand by one caller).
34cabdff1aSopenharmony_ci; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
35cabdff1aSopenharmony_ci%macro COMPOSE_53iL0 4
; %2 = %2 + %3
36cabdff1aSopenharmony_ci    paddw   %2, %3
; add the rounding term supplied in %4
37cabdff1aSopenharmony_ci    paddw   %2, %4
38cabdff1aSopenharmony_ci    psraw   %2, 2
39cabdff1aSopenharmony_ci    psubw   %1, %2
40cabdff1aSopenharmony_ci%endm
41cabdff1aSopenharmony_ci
; COMPOSE_DD97iH0 add, inner, outer [, addsrc]
;   Implicit inputs : m0 = first outer tap, m1 = first inner tap,
;                     m3 = rounding words, m4 = pw_1991 (9,-1 pairs).
;   Result          : m1 (packed 16-bit words).
;   Besides m2, m0 is also overwritten (paddw/psubw below). When %4 is given,
;   %1 is written by the movu, so it must be a register in that form.
42cabdff1aSopenharmony_ci; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
43cabdff1aSopenharmony_ci; if %4 is supplied, %1 is loaded unaligned from there
44cabdff1aSopenharmony_ci; m2: clobbered  m3: pw_8  m4: pw_1991
45cabdff1aSopenharmony_ci%macro COMPOSE_DD97iH0 3-4
; m0 = sum of the two outer taps, m1 = sum of the two inner taps
46cabdff1aSopenharmony_ci    paddw   m0, %3
47cabdff1aSopenharmony_ci    paddw   m1, %2
; subtract the rounding term from the outer sum: it is multiplied by -1 below,
; so it ends up added to the 32-bit result
48cabdff1aSopenharmony_ci    psubw   m0, m3
; interleave words as (inner, outer) pairs; low half in m1, high half in m2
49cabdff1aSopenharmony_ci    mova    m2, m1
50cabdff1aSopenharmony_ci    punpcklwd m1, m0
51cabdff1aSopenharmony_ci    punpckhwd m2, m0
; each 32-bit lane = 9*inner - outer
52cabdff1aSopenharmony_ci    pmaddwd m1, m4
53cabdff1aSopenharmony_ci    pmaddwd m2, m4
54cabdff1aSopenharmony_ci%if %0 > 3
; optional 4th argument: fetch the value to be added with an unaligned load
55cabdff1aSopenharmony_ci    movu    %1, %4
56cabdff1aSopenharmony_ci%endif
57cabdff1aSopenharmony_ci    psrad   m1, 4
58cabdff1aSopenharmony_ci    psrad   m2, 4
; narrow the two dword halves back to words (packssdw saturates to int16)
59cabdff1aSopenharmony_ci    packssdw m1, m2
60cabdff1aSopenharmony_ci    paddw   m1, %1
61cabdff1aSopenharmony_ci%endm
62cabdff1aSopenharmony_ci
; COMPOSE_VERTICAL suffix
;   Emits five vertical lifting functions; %1 is appended to each function
;   name after an underscore.
;
;   All five loops share one shape: width is decremented by mmsize/2 (the
;   number of 16-bit elements in one vector) at the top of each iteration and
;   used as the element index, so the rows are walked from the end towards the
;   start. The closing `jg` tests the flags produced by that `sub`; the SIMD
;   and mov instructions in between do not modify EFLAGS.
;
;   All row accesses use mova (aligned) at byte offset 2*width.
;   NOTE(review): this assumes width is a positive multiple of mmsize/2 and the
;   row pointers are mmsize-aligned; otherwise the last iteration would index
;   before the start of the rows -- confirm against the callers.
;   NOTE(review): the third numeric cglobal argument (presumably the vector
;   register count, per x86inc convention) is lower than the highest mN used
;   in vertical_compose53iL0 (m2), vertical_compose_dirac53iH0 (m1) and
;   vertical_compose_haar (m3) -- confirm this is harmless.
63cabdff1aSopenharmony_ci%macro COMPOSE_VERTICAL 1
64cabdff1aSopenharmony_ci; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
65cabdff1aSopenharmony_ci;                                  int width)
; b1[i] -= (b0[i] + b2[i] + 2) >> 2
66cabdff1aSopenharmony_cicglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
67cabdff1aSopenharmony_ci    mova    m2, [pw_2]
68cabdff1aSopenharmony_ci%if ARCH_X86_64
; a 32-bit register write clears bits 63:32, so widthq becomes the
; zero-extended int width and is safe to use in 64-bit addressing
69cabdff1aSopenharmony_ci    mov     widthd, widthd
70cabdff1aSopenharmony_ci%endif
71cabdff1aSopenharmony_ci.loop:
72cabdff1aSopenharmony_ci    sub     widthq, mmsize/2
73cabdff1aSopenharmony_ci    mova    m1, [b0q+2*widthq]
74cabdff1aSopenharmony_ci    mova    m0, [b1q+2*widthq]
75cabdff1aSopenharmony_ci    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
76cabdff1aSopenharmony_ci    mova    [b1q+2*widthq], m0
; flags still come from the sub at the top of the loop
77cabdff1aSopenharmony_ci    jg      .loop
78cabdff1aSopenharmony_ci    REP_RET
79cabdff1aSopenharmony_ci
80cabdff1aSopenharmony_ci; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
81cabdff1aSopenharmony_ci;                                  int width)
; b1[i] += (b0[i] + b2[i] + 1) >> 1
82cabdff1aSopenharmony_cicglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
83cabdff1aSopenharmony_ci    mova    m1, [pw_1]
84cabdff1aSopenharmony_ci%if ARCH_X86_64
; zero-extend the int width into widthq
85cabdff1aSopenharmony_ci    mov     widthd, widthd
86cabdff1aSopenharmony_ci%endif
87cabdff1aSopenharmony_ci.loop:
88cabdff1aSopenharmony_ci    sub     widthq, mmsize/2
89cabdff1aSopenharmony_ci    mova    m0, [b0q+2*widthq]
90cabdff1aSopenharmony_ci    paddw   m0, [b2q+2*widthq]
; m1 holds the rounding term loaded from pw_1
91cabdff1aSopenharmony_ci    paddw   m0, m1
92cabdff1aSopenharmony_ci    psraw   m0, 1
93cabdff1aSopenharmony_ci    paddw   m0, [b1q+2*widthq]
94cabdff1aSopenharmony_ci    mova    [b1q+2*widthq], m0
95cabdff1aSopenharmony_ci    jg      .loop
96cabdff1aSopenharmony_ci    REP_RET
97cabdff1aSopenharmony_ci
98cabdff1aSopenharmony_ci; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
99cabdff1aSopenharmony_ci;                               IDWTELEM *b3, IDWTELEM *b4, int width)
; b2[i] += (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 8) >> 4
100cabdff1aSopenharmony_cicglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
; m3/m4 are the implicit rounding and multiplier inputs of COMPOSE_DD97iH0
101cabdff1aSopenharmony_ci    mova    m3, [pw_8]
102cabdff1aSopenharmony_ci    mova    m4, [pw_1991]
103cabdff1aSopenharmony_ci%if ARCH_X86_64
; zero-extend the int width into widthq
104cabdff1aSopenharmony_ci    mov     widthd, widthd
105cabdff1aSopenharmony_ci%endif
106cabdff1aSopenharmony_ci.loop:
107cabdff1aSopenharmony_ci    sub     widthq, mmsize/2
; m0 = outer tap b0, m1 = inner tap b1; the macro adds b4 and b3 to them
108cabdff1aSopenharmony_ci    mova    m0, [b0q+2*widthq]
109cabdff1aSopenharmony_ci    mova    m1, [b1q+2*widthq]
110cabdff1aSopenharmony_ci    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
111cabdff1aSopenharmony_ci    mova    [b2q+2*widthq], m1
112cabdff1aSopenharmony_ci    jg      .loop
113cabdff1aSopenharmony_ci    REP_RET
114cabdff1aSopenharmony_ci
115cabdff1aSopenharmony_ci; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
116cabdff1aSopenharmony_ci;                                IDWTELEM *b3, IDWTELEM *b4, int width)
; b2[i] -= (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 16) >> 5
; Same arithmetic as COMPOSE_DD97iH0, written inline because the rounding
; term, the shift count and the final operation (subtract) differ.
117cabdff1aSopenharmony_cicglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
118cabdff1aSopenharmony_ci    mova    m3, [pw_16]
119cabdff1aSopenharmony_ci    mova    m4, [pw_1991]
120cabdff1aSopenharmony_ci%if ARCH_X86_64
; zero-extend the int width into widthq
121cabdff1aSopenharmony_ci    mov     widthd, widthd
122cabdff1aSopenharmony_ci%endif
123cabdff1aSopenharmony_ci.loop:
124cabdff1aSopenharmony_ci    sub     widthq, mmsize/2
125cabdff1aSopenharmony_ci    mova    m0, [b0q+2*widthq]
126cabdff1aSopenharmony_ci    mova    m1, [b1q+2*widthq]
127cabdff1aSopenharmony_ci    mova    m5, [b2q+2*widthq]
; m0 = b0 + b4 (outer taps), m1 = b1 + b3 (inner taps)
128cabdff1aSopenharmony_ci    paddw   m0, [b4q+2*widthq]
129cabdff1aSopenharmony_ci    paddw   m1, [b3q+2*widthq]
; the rounding term is subtracted from the outer sum, which pmaddwd negates
130cabdff1aSopenharmony_ci    psubw   m0, m3
131cabdff1aSopenharmony_ci    mova    m2, m1
132cabdff1aSopenharmony_ci    punpcklwd m1, m0
133cabdff1aSopenharmony_ci    punpckhwd m2, m0
; each 32-bit lane = 9*inner - outer
134cabdff1aSopenharmony_ci    pmaddwd m1, m4
135cabdff1aSopenharmony_ci    pmaddwd m2, m4
136cabdff1aSopenharmony_ci    psrad   m1, 5
137cabdff1aSopenharmony_ci    psrad   m2, 5
138cabdff1aSopenharmony_ci    packssdw m1, m2
139cabdff1aSopenharmony_ci    psubw   m5, m1
140cabdff1aSopenharmony_ci    mova    [b2q+2*widthq], m5
141cabdff1aSopenharmony_ci    jg      .loop
142cabdff1aSopenharmony_ci    REP_RET
143cabdff1aSopenharmony_ci
144cabdff1aSopenharmony_ci; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
; b0[i] -= (b1[i] + 1) >> 1, then b1[i] += b0[i] using the updated b0
145cabdff1aSopenharmony_cicglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
146cabdff1aSopenharmony_ci    mova    m3, [pw_1]
147cabdff1aSopenharmony_ci%if ARCH_X86_64
; zero-extend the int width into widthq
148cabdff1aSopenharmony_ci    mov     widthd, widthd
149cabdff1aSopenharmony_ci%endif
150cabdff1aSopenharmony_ci.loop:
151cabdff1aSopenharmony_ci    sub     widthq, mmsize/2
152cabdff1aSopenharmony_ci    mova    m1, [b1q+2*widthq]
153cabdff1aSopenharmony_ci    mova    m0, [b0q+2*widthq]
; keep an unmodified copy of b1 for the second step
154cabdff1aSopenharmony_ci    mova    m2, m1
155cabdff1aSopenharmony_ci    paddw   m1, m3
156cabdff1aSopenharmony_ci    psraw   m1, 1
157cabdff1aSopenharmony_ci    psubw   m0, m1
158cabdff1aSopenharmony_ci    mova    [b0q+2*widthq], m0
159cabdff1aSopenharmony_ci    paddw   m2, m0
160cabdff1aSopenharmony_ci    mova    [b1q+2*widthq], m2
161cabdff1aSopenharmony_ci    jg      .loop
162cabdff1aSopenharmony_ci    REP_RET
163cabdff1aSopenharmony_ci%endmacro
164cabdff1aSopenharmony_ci
; EDGE_EXTENSION left, right, scratch
;   %1 = number of elements to replicate before tmp[0]
;   %2 = number of elements to replicate after tmp[w2-1]
;   %3 = scratch general-purpose register; its width sets the size of each
;        copy (the only use in this file passes the 16-bit register xw)
;   Relies on tmpq and w2q being defined by the enclosing function; offsets
;   are in 2-byte elements. The unrolling happens at assembly time via %rep.
165cabdff1aSopenharmony_ci; extend the left and right edges of the tmp array by %1 and %2 respectively
166cabdff1aSopenharmony_ci%macro EDGE_EXTENSION 3
; left edge: copy tmp[0] into tmp[-1] .. tmp[-%1]
167cabdff1aSopenharmony_ci    mov     %3, [tmpq]
168cabdff1aSopenharmony_ci%assign %%i 1
169cabdff1aSopenharmony_ci%rep %1
170cabdff1aSopenharmony_ci    mov     [tmpq-2*%%i], %3
171cabdff1aSopenharmony_ci    %assign %%i %%i+1
172cabdff1aSopenharmony_ci%endrep
; right edge: copy tmp[w2-1] into tmp[w2] .. tmp[w2+%2-1]
173cabdff1aSopenharmony_ci    mov     %3, [tmpq+2*w2q-2]
174cabdff1aSopenharmony_ci%assign %%i 0
175cabdff1aSopenharmony_ci%rep %2
176cabdff1aSopenharmony_ci    mov     [tmpq+2*w2q+2*%%i], %3
177cabdff1aSopenharmony_ci    %assign %%i %%i+1
178cabdff1aSopenharmony_ci%endrep
179cabdff1aSopenharmony_ci%endmacro
180cabdff1aSopenharmony_ci
181cabdff1aSopenharmony_ci
; HAAR_HORIZONTAL suffix, shift
;   %1 = suffix appended to the function name
;   %2 = 0 or 1; it is pasted into the function name (haar0i / haar1i) and,
;        when 1, both outputs are rounded and shifted right by one bit
;
;   Emitted function, with w2 = width >> 1 and b_w2 = b + width bytes
;   (i.e. &b[w2] in 16-bit elements when width is even):
;     pass 1:  tmp[x] = b[x] - ((b_w2[x] + 1) >> 1)
;     pass 2:  lo = tmp[x]; hi = b_w2[x] + tmp[x]
;              if %2 == 1: lo = (lo + 1) >> 1; hi = (hi + 1) >> 1
;              b[2x] = lo; b[2x+1] = hi
;   Pass 2 only covers w2 rounded down to a multiple of mmsize/2.
;   NOTE(review): presumably the caller finishes the remaining elements, as
;   the comment in horizontal_compose_dd97i says for that function -- confirm.
;   NOTE(review): pass 1 rounds up to whole vectors, so it reads b/b_w2 and
;   writes tmp past index w2-1 when w2 is not a multiple of mmsize/2 --
;   presumably the buffers are padded; confirm.
182cabdff1aSopenharmony_ci%macro HAAR_HORIZONTAL 2
183cabdff1aSopenharmony_ci; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
184cabdff1aSopenharmony_cicglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
; width is a 32-bit int, but wq is used as a 64-bit address component in the
; lea below. Zero-extend it first, exactly as the vertical functions in this
; file do for widthq, so stale upper bits cannot corrupt b_w2.
%if ARCH_X86_64
    mov     wd, wd
%endif
185cabdff1aSopenharmony_ci    mov    w2d, wd
186cabdff1aSopenharmony_ci    xor     xq, xq
187cabdff1aSopenharmony_ci    shr    w2d, 1
; b_w2 = b + width bytes
188cabdff1aSopenharmony_ci    lea  b_w2q, [bq+wq]
189cabdff1aSopenharmony_ci    mova    m3, [pw_1]
190cabdff1aSopenharmony_ci.lowpass_loop:
; the second-half pointer is loaded with movu; b and tmp are accessed aligned
191cabdff1aSopenharmony_ci    movu    m1, [b_w2q + 2*xq]
192cabdff1aSopenharmony_ci    mova    m0, [bq    + 2*xq]
193cabdff1aSopenharmony_ci    paddw   m1, m3
194cabdff1aSopenharmony_ci    psraw   m1, 1
195cabdff1aSopenharmony_ci    psubw   m0, m1
196cabdff1aSopenharmony_ci    mova    [tmpq + 2*xq], m0
197cabdff1aSopenharmony_ci    add     xq, mmsize/2
198cabdff1aSopenharmony_ci    cmp     xq, w2q
199cabdff1aSopenharmony_ci    jl      .lowpass_loop
200cabdff1aSopenharmony_ci
; restart at x = 0 and restrict pass 2 to whole vectors
201cabdff1aSopenharmony_ci    xor     xq, xq
202cabdff1aSopenharmony_ci    and    w2q, ~(mmsize/2 - 1)
203cabdff1aSopenharmony_ci    cmp    w2q, mmsize/2
204cabdff1aSopenharmony_ci    jl      .end
205cabdff1aSopenharmony_ci
206cabdff1aSopenharmony_ci.highpass_loop:
207cabdff1aSopenharmony_ci    movu    m1, [b_w2q + 2*xq]
208cabdff1aSopenharmony_ci    mova    m0, [tmpq  + 2*xq]
209cabdff1aSopenharmony_ci    paddw   m1, m0
210cabdff1aSopenharmony_ci
211cabdff1aSopenharmony_ci    ; shift and interleave
212cabdff1aSopenharmony_ci%if %2 == 1
213cabdff1aSopenharmony_ci    paddw   m0, m3
214cabdff1aSopenharmony_ci    paddw   m1, m3
215cabdff1aSopenharmony_ci    psraw   m0, 1
216cabdff1aSopenharmony_ci    psraw   m1, 1
217cabdff1aSopenharmony_ci%endif
; interleave (m0, m1) word pairs; one input vector yields two output vectors
218cabdff1aSopenharmony_ci    mova    m2, m0
219cabdff1aSopenharmony_ci    punpcklwd m0, m1
220cabdff1aSopenharmony_ci    punpckhwd m2, m1
221cabdff1aSopenharmony_ci    mova    [bq+4*xq], m0
222cabdff1aSopenharmony_ci    mova    [bq+4*xq+mmsize], m2
223cabdff1aSopenharmony_ci
224cabdff1aSopenharmony_ci    add     xq, mmsize/2
225cabdff1aSopenharmony_ci    cmp     xq, w2q
226cabdff1aSopenharmony_ci    jl      .highpass_loop
227cabdff1aSopenharmony_ci.end:
228cabdff1aSopenharmony_ci    REP_RET
229cabdff1aSopenharmony_ci%endmacro
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_ci
; horizontal_compose_dd97i_ssse3
;   With w2 = width >> 1 and b_w2 = b + width bytes (i.e. &b[w2] in 16-bit
;   elements when width is even):
;     pass 1:  tmp[x] = b[x] - ((b_w2[x-1] + b_w2[x] + 2) >> 2),
;              where b_w2[-1] is replaced by b_w2[0]
;     edges :  tmp[-1] = tmp[0]; tmp[w2] = tmp[w2+1] = tmp[w2-1]
;     pass 2:  hi = b_w2[x] + ((-tmp[x-1] + 9*tmp[x] + 9*tmp[x+1]
;                               - tmp[x+2] + 8) >> 4)
;              b[2x] = (tmp[x] + 1) >> 1;  b[2x+1] = (hi + 1) >> 1
;   Pass 2 only covers w2 rounded down to a multiple of mmsize/2; the rest is
;   left to the caller (see the comment before pass 2).
;   tmp is accessed with mova at tmp-mmsize and tmp+2*x+16, so it must be
;   aligned and readable one vector before and after the processed range.
;   NOTE(review): pass 1 rounds up to whole vectors and therefore touches
;   b/b_w2/tmp past index w2-1 when w2 is not a multiple of mmsize/2 --
;   presumably the buffers are padded; confirm.
232cabdff1aSopenharmony_ciINIT_XMM
233cabdff1aSopenharmony_ci; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
234cabdff1aSopenharmony_cicglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
; width is a 32-bit int, but wq is used as a 64-bit address component in the
; lea and movu below. Zero-extend it first, exactly as the vertical functions
; in this file do for widthq, so stale upper bits cannot corrupt the address.
%if ARCH_X86_64
    mov     wd, wd
%endif
235cabdff1aSopenharmony_ci    mov    w2d, wd
236cabdff1aSopenharmony_ci    xor     xd, xd
237cabdff1aSopenharmony_ci    shr    w2d, 1
238cabdff1aSopenharmony_ci    lea  b_w2q, [bq+wq]
; seed the "previous vector" register: move b_w2[0] into the top word of m4 so
; the first palignr below yields b_w2[0] in the x-1 position
239cabdff1aSopenharmony_ci    movu    m4, [bq+wq]
240cabdff1aSopenharmony_ci    mova    m7, [pw_2]
241cabdff1aSopenharmony_ci    pslldq  m4, 14
242cabdff1aSopenharmony_ci.lowpass_loop:
243cabdff1aSopenharmony_ci    movu    m1, [b_w2q + 2*xq]
244cabdff1aSopenharmony_ci    mova    m0, [bq    + 2*xq]
245cabdff1aSopenharmony_ci    mova    m2, m1
; m1 = b_w2[x-1 .. x+6]: top word of the previous vector followed by the
; first seven words of the current one
246cabdff1aSopenharmony_ci    palignr m1, m4, 14
; remember the current vector for the next iteration
247cabdff1aSopenharmony_ci    mova    m4, m2
248cabdff1aSopenharmony_ci    COMPOSE_53iL0 m0, m1, m2, m7
249cabdff1aSopenharmony_ci    mova    [tmpq + 2*xq], m0
250cabdff1aSopenharmony_ci    add     xd, mmsize/2
251cabdff1aSopenharmony_ci    cmp     xd, w2d
252cabdff1aSopenharmony_ci    jl      .lowpass_loop
253cabdff1aSopenharmony_ci
; x is free here (it is reset just below), so its 16-bit view is the scratch
254cabdff1aSopenharmony_ci    EDGE_EXTENSION 1, 2, xw
255cabdff1aSopenharmony_ci    ; leave the last up to 7 (sse) or 3 (mmx) values for C
256cabdff1aSopenharmony_ci    xor     xd, xd
257cabdff1aSopenharmony_ci    and    w2d, ~(mmsize/2 - 1)
258cabdff1aSopenharmony_ci    cmp    w2d, mmsize/2
259cabdff1aSopenharmony_ci    jl      .end
260cabdff1aSopenharmony_ci
; m7 = vector before tmp[0] (its top word is the extended tmp[-1]),
; m0 = tmp[0..7]; m3/m4 are the implicit inputs of COMPOSE_DD97iH0
261cabdff1aSopenharmony_ci    mova    m7, [tmpq-mmsize]
262cabdff1aSopenharmony_ci    mova    m0, [tmpq]
263cabdff1aSopenharmony_ci    mova    m5, [pw_1]
264cabdff1aSopenharmony_ci    mova    m3, [pw_8]
265cabdff1aSopenharmony_ci    mova    m4, [pw_1991]
266cabdff1aSopenharmony_ci.highpass_loop:
; m6 = tmp[x .. x+7] (current), m0 = tmp[x-1 .. x+6]
267cabdff1aSopenharmony_ci    mova    m6, m0
268cabdff1aSopenharmony_ci    palignr m0, m7, 14
; m7 = next vector tmp[x+8 ..]; m1 = tmp[x+1 ..], m2 = tmp[x+2 ..]
269cabdff1aSopenharmony_ci    mova    m7, [tmpq + 2*xq + 16]
270cabdff1aSopenharmony_ci    mova    m1, m7
271cabdff1aSopenharmony_ci    mova    m2, m7
272cabdff1aSopenharmony_ci    palignr m1, m6, 2
273cabdff1aSopenharmony_ci    palignr m2, m6, 4
; outer taps: m0 (x-1) and m2 (x+2); inner taps: m1 (x+1) and m6 (x);
; m0 is then reused to hold b_w2[x], loaded unaligned by the macro
274cabdff1aSopenharmony_ci    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
; rotate the window: next becomes current, current becomes previous
275cabdff1aSopenharmony_ci    mova    m0, m7
276cabdff1aSopenharmony_ci    mova    m7, m6
277cabdff1aSopenharmony_ci
278cabdff1aSopenharmony_ci    ; shift and interleave
279cabdff1aSopenharmony_ci    paddw   m6, m5
280cabdff1aSopenharmony_ci    paddw   m1, m5
281cabdff1aSopenharmony_ci    psraw   m6, 1
282cabdff1aSopenharmony_ci    psraw   m1, 1
283cabdff1aSopenharmony_ci    mova    m2, m6
284cabdff1aSopenharmony_ci    punpcklwd m6, m1
285cabdff1aSopenharmony_ci    punpckhwd m2, m1
286cabdff1aSopenharmony_ci    mova    [bq+4*xq], m6
287cabdff1aSopenharmony_ci    mova    [bq+4*xq+mmsize], m2
288cabdff1aSopenharmony_ci
289cabdff1aSopenharmony_ci    add     xd, mmsize/2
290cabdff1aSopenharmony_ci    cmp     xd, w2d
291cabdff1aSopenharmony_ci    jl      .highpass_loop
292cabdff1aSopenharmony_ci.end:
293cabdff1aSopenharmony_ci    REP_RET
294cabdff1aSopenharmony_ci
295cabdff1aSopenharmony_ci
; Instantiate the macro-generated functions with the "sse2" name suffix.
; INIT_XMM is defined outside this file; presumably it selects the XMM
; register set (mmsize = 16) for the code that follows -- confirm in x86inc.
296cabdff1aSopenharmony_ciINIT_XMM
; five vertical_compose*_sse2 functions
297cabdff1aSopenharmony_ciCOMPOSE_VERTICAL sse2
; horizontal_compose_haar0i_sse2: outputs are not shifted
298cabdff1aSopenharmony_ciHAAR_HORIZONTAL sse2, 0
; horizontal_compose_haar1i_sse2: outputs are rounded and shifted right by 1
299cabdff1aSopenharmony_ciHAAR_HORIZONTAL sse2, 1
300