1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* Copyright (c) 2010 David Conrad
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci;*
11cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
15cabdff1aSopenharmony_ci;*
16cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci;******************************************************************************
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
22cabdff1aSopenharmony_ci
; Read-only constants used by the routines in this file.
23cabdff1aSopenharmony_ciSECTION_RODATA
; pw_7: eight 16-bit words of 7; multiplier for the 7*(...) terms in HPEL_FILTER.
24cabdff1aSopenharmony_cipw_7: times 8 dw 7
; convert_to_unsigned_10bit: four dwords of 0x200; added to every 32-bit sample
; in put_signed_rect_clamped_10 before the samples are packed to words.
25cabdff1aSopenharmony_ciconvert_to_unsigned_10bit: times 4 dd 0x200
; clip_10bit: eight words of 0x3ff; the upper bound handed to CLIPW in
; put_signed_rect_clamped_10.
26cabdff1aSopenharmony_ciclip_10bit:                times 8 dw 0x3ff
27cabdff1aSopenharmony_ci
; Constants defined outside this file; their values are not visible here.
; Presumably packed words of 3 / 16 / 32 and packed bytes of 0x80, as the names
; and the in-line comments at their use sites suggest -- TODO confirm.
;   pw_3, pw_16 : used by HPEL_FILTER
;   pw_32       : used by ADD_RECT
;   pb_80       : used by PUT_RECT
28cabdff1aSopenharmony_cicextern pw_3
29cabdff1aSopenharmony_cicextern pw_16
30cabdff1aSopenharmony_cicextern pw_32
31cabdff1aSopenharmony_cicextern pb_80
32cabdff1aSopenharmony_ci
33cabdff1aSopenharmony_ciSECTION .text
34cabdff1aSopenharmony_ci
; UNPACK_ADD dst_lo, dst_hi, src_a, src_b, mov_a, mov_b
;   %1, %2 : destination vector registers
;   %3, %4 : source operands (memory operands at every use in this file)
;   %5, %6 : suffix appended to "mov" for the load of %3 / %4 (callers pass a or u)
; Loads two byte vectors, widens each half to 16-bit words by interleaving with
; m7, and sums the matching halves:
;   %1 = lo(%3) + lo(%4)        %2 = hi(%3) + hi(%4)
; Requires m7 == 0 so that the interleave is a zero-extension (every caller in
; this file clears m7 with pxor first).
; Clobbers m4 and m5.
35cabdff1aSopenharmony_ci%macro UNPACK_ADD 6
36cabdff1aSopenharmony_ci    mov%5   %1, %3
37cabdff1aSopenharmony_ci    mov%6   m5, %4
    ; keep untouched copies so the high halves can be unpacked separately
38cabdff1aSopenharmony_ci    mova    m4, %1
39cabdff1aSopenharmony_ci    mova    %2, m5
    ; low halves: %1 = lo(%3), m5 = lo(%4)
40cabdff1aSopenharmony_ci    punpcklbw %1, m7
41cabdff1aSopenharmony_ci    punpcklbw m5, m7
    ; high halves: m4 = hi(%3), %2 = hi(%4)
42cabdff1aSopenharmony_ci    punpckhbw m4, m7
43cabdff1aSopenharmony_ci    punpckhbw %2, m7
44cabdff1aSopenharmony_ci    paddw   %1, m5
45cabdff1aSopenharmony_ci    paddw   %2, m4
46cabdff1aSopenharmony_ci%endmacro
47cabdff1aSopenharmony_ci
; HPEL_FILTER suffix
;   %1 : suffix appended to the names of the two functions emitted below.
; Emits dirac_hpel_filter_v_%1 (taps one row apart) and dirac_hpel_filter_h_%1
; (taps one byte apart). Both evaluate, per output byte and in 16-bit lanes,
;   (3*(7*(s[0]+s[1]) + s[-2]+s[3]) - 7*(s[-1]+s[2]) - (s[-3]+s[4]) + pw_16) >> 5
; and store the result with unsigned byte saturation (packuswb).
; pw_3 and pw_16 are external; assumed to be words of 3 and 16 -- TODO confirm.
48cabdff1aSopenharmony_ci%macro HPEL_FILTER 1
49cabdff1aSopenharmony_ci; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
; Vertical filter: s[k] is the byte at src + k*stride.
; Handles mmsize output bytes per iteration; the loop runs while the remaining
; width is > 0, so a width that is not a multiple of mmsize is rounded up.
; All loads and the store use the aligned form (mova).
; NOTE(review): strideq is used as a full-width register without sign extension
; although the prototype comment declares "int stride" -- confirm on x86-64.
50cabdff1aSopenharmony_cicglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    ; src0 = src - 3*stride, the topmost tap row
51cabdff1aSopenharmony_ci    mov     src0q, srcq
52cabdff1aSopenharmony_ci    lea     stridex3q, [3*strideq]
53cabdff1aSopenharmony_ci    sub     src0q, stridex3q
    ; m7 = 0, required by UNPACK_ADD
54cabdff1aSopenharmony_ci    pxor    m7, m7
55cabdff1aSopenharmony_ci.loop:
56cabdff1aSopenharmony_ci    ; 7*(src[0] + src[1])
57cabdff1aSopenharmony_ci    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
58cabdff1aSopenharmony_ci    pmullw  m0, [pw_7]
59cabdff1aSopenharmony_ci    pmullw  m1, [pw_7]
60cabdff1aSopenharmony_ci
61cabdff1aSopenharmony_ci    ; 3*( ... + src[-2] + src[3])
    ; src0 + stride == src - 2*stride
62cabdff1aSopenharmony_ci    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
63cabdff1aSopenharmony_ci    paddw   m0, m2
64cabdff1aSopenharmony_ci    paddw   m1, m3
65cabdff1aSopenharmony_ci    pmullw  m0, [pw_3]
66cabdff1aSopenharmony_ci    pmullw  m1, [pw_3]
67cabdff1aSopenharmony_ci
68cabdff1aSopenharmony_ci    ; ... - 7*(src[-1] + src[2])
    ; src0 + 2*stride == src - stride
69cabdff1aSopenharmony_ci    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
70cabdff1aSopenharmony_ci    pmullw  m2, [pw_7]
71cabdff1aSopenharmony_ci    pmullw  m3, [pw_7]
72cabdff1aSopenharmony_ci    psubw   m0, m2
73cabdff1aSopenharmony_ci    psubw   m1, m3
74cabdff1aSopenharmony_ci
75cabdff1aSopenharmony_ci    ; ... - (src[-3] + src[4])
76cabdff1aSopenharmony_ci    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
77cabdff1aSopenharmony_ci    psubw   m0, m2
78cabdff1aSopenharmony_ci    psubw   m1, m3
79cabdff1aSopenharmony_ci
    ; add pw_16, arithmetic shift right by 5, pack both halves back to bytes
80cabdff1aSopenharmony_ci    paddw   m0, [pw_16]
81cabdff1aSopenharmony_ci    paddw   m1, [pw_16]
82cabdff1aSopenharmony_ci    psraw   m0, 5
83cabdff1aSopenharmony_ci    psraw   m1, 5
84cabdff1aSopenharmony_ci    packuswb m0, m1
85cabdff1aSopenharmony_ci    mova    [dstq], m0
    ; advance all three pointers by one vector; src0 keeps its -3*stride offset
86cabdff1aSopenharmony_ci    add     dstq, mmsize
87cabdff1aSopenharmony_ci    add     srcq, mmsize
88cabdff1aSopenharmony_ci    add     src0q, mmsize
89cabdff1aSopenharmony_ci    sub     widthd, mmsize
90cabdff1aSopenharmony_ci    jg      .loop
91cabdff1aSopenharmony_ci    RET
92cabdff1aSopenharmony_ci
93cabdff1aSopenharmony_ci; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
; Horizontal filter: s[k] is the byte at src + x + k.
; width is turned into the byte offset of the last mmsize-sized chunk,
; (width-1) & ~(mmsize-1), and the loop walks the chunks from that offset down
; to 0. The 32-bit writes to widthd leave widthq usable as an address index.
; Loads are unaligned (movu); the store is aligned (mova).
; The loads span src+offset-3 .. src+offset+4+mmsize-1, so the row presumably
; needs readable padding on both sides -- confirm against the caller.
94cabdff1aSopenharmony_cicglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
95cabdff1aSopenharmony_ci    dec     widthd
    ; m7 = 0, required by UNPACK_ADD
96cabdff1aSopenharmony_ci    pxor    m7, m7
97cabdff1aSopenharmony_ci    and     widthd, ~(mmsize-1)
98cabdff1aSopenharmony_ci.loop:
99cabdff1aSopenharmony_ci    ; 7*(src[0] + src[1])
100cabdff1aSopenharmony_ci    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
101cabdff1aSopenharmony_ci    pmullw  m0, [pw_7]
102cabdff1aSopenharmony_ci    pmullw  m1, [pw_7]
103cabdff1aSopenharmony_ci
104cabdff1aSopenharmony_ci    ; 3*( ... + src[-2] + src[3])
105cabdff1aSopenharmony_ci    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
106cabdff1aSopenharmony_ci    paddw   m0, m2
107cabdff1aSopenharmony_ci    paddw   m1, m3
108cabdff1aSopenharmony_ci    pmullw  m0, [pw_3]
109cabdff1aSopenharmony_ci    pmullw  m1, [pw_3]
110cabdff1aSopenharmony_ci
111cabdff1aSopenharmony_ci    ; ... - 7*(src[-1] + src[2])
112cabdff1aSopenharmony_ci    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
113cabdff1aSopenharmony_ci    pmullw  m2, [pw_7]
114cabdff1aSopenharmony_ci    pmullw  m3, [pw_7]
115cabdff1aSopenharmony_ci    psubw   m0, m2
116cabdff1aSopenharmony_ci    psubw   m1, m3
117cabdff1aSopenharmony_ci
118cabdff1aSopenharmony_ci    ; ... - (src[-3] + src[4])
119cabdff1aSopenharmony_ci    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
120cabdff1aSopenharmony_ci    psubw   m0, m2
121cabdff1aSopenharmony_ci    psubw   m1, m3
122cabdff1aSopenharmony_ci
    ; add pw_16, arithmetic shift right by 5, pack both halves back to bytes
123cabdff1aSopenharmony_ci    paddw   m0, [pw_16]
124cabdff1aSopenharmony_ci    paddw   m1, [pw_16]
125cabdff1aSopenharmony_ci    psraw   m0, 5
126cabdff1aSopenharmony_ci    psraw   m1, 5
127cabdff1aSopenharmony_ci    packuswb m0, m1
128cabdff1aSopenharmony_ci    mova    [dstq + widthq], m0
    ; step to the previous chunk; jge so that the chunk at offset 0 is processed
129cabdff1aSopenharmony_ci    sub     widthd, mmsize
130cabdff1aSopenharmony_ci    jge     .loop
131cabdff1aSopenharmony_ci    RET
132cabdff1aSopenharmony_ci%endmacro
133cabdff1aSopenharmony_ci
; PUT_RECT suffix
;   %1 : suffix appended to the emitted function name put_signed_rect_clamped_%1.
; Converts a rectangle of signed 16-bit samples to bytes: each sample is packed
; with signed saturation (packsswb) and pb_80 is then added byte-wise.
; Two rows are handled per outer iteration, and each row is walked from its
; right edge towards offset 0.
; width is rounded up to a multiple of mmsize, so up to mmsize-1 extra samples
; per row are read and written -- the buffers presumably carry that padding,
; confirm against the caller.
; Both strides are added to the row pointers directly, i.e. they are byte counts.
134cabdff1aSopenharmony_ci%macro PUT_RECT 1
135cabdff1aSopenharmony_ci; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
136cabdff1aSopenharmony_cicglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    ; m0 = byte bias (external pb_80, presumably 0x80 per byte -- TODO confirm)
137cabdff1aSopenharmony_ci    mova    m0, [pb_80]
    ; round width up to a multiple of mmsize
138cabdff1aSopenharmony_ci    add     wd, (mmsize-1)
139cabdff1aSopenharmony_ci    and     wd, ~(mmsize-1)
140cabdff1aSopenharmony_ci
    ; x86-64: sign-extend the int strides, keep height in r7d and the rounded
    ;         width in r8d so it can be reloaded for every row pair.
    ; x86-32: write the rounded width back to its own argument slot (r4m) and
    ;         count height down in place in its argument slot (r5mp).
141cabdff1aSopenharmony_ci%if ARCH_X86_64
142cabdff1aSopenharmony_ci    movsxd   dst_strideq, dst_strided
143cabdff1aSopenharmony_ci    movsxd   src_strideq, src_strided
144cabdff1aSopenharmony_ci    mov   r7d, r5m
145cabdff1aSopenharmony_ci    mov   r8d, wd
146cabdff1aSopenharmony_ci    %define wspill r8d
147cabdff1aSopenharmony_ci    %define hd r7d
148cabdff1aSopenharmony_ci%else
149cabdff1aSopenharmony_ci    mov    r4m, wd
150cabdff1aSopenharmony_ci    %define wspill r4m
151cabdff1aSopenharmony_ci    %define hd r5mp
152cabdff1aSopenharmony_ci%endif
153cabdff1aSopenharmony_ci
154cabdff1aSopenharmony_ci.loopy:
    ; src2/dst2 address the second row of the current pair
155cabdff1aSopenharmony_ci    lea     src2q, [srcq+src_strideq]
156cabdff1aSopenharmony_ci    lea     dst2q, [dstq+dst_strideq]
157cabdff1aSopenharmony_ci.loopx:
    ; w counts output bytes; the input is 16-bit, hence the 2*wq addressing.
    ; The flags set by this sub are consumed by "jg .loopx" below; none of the
    ; vector instructions in between write the flags.
158cabdff1aSopenharmony_ci    sub      wd, mmsize
159cabdff1aSopenharmony_ci    mova     m1, [srcq +2*wq]
160cabdff1aSopenharmony_ci    mova     m2, [src2q+2*wq]
    ; narrow two vectors of words into one vector of signed-saturated bytes
161cabdff1aSopenharmony_ci    packsswb m1, [srcq +2*wq+mmsize]
162cabdff1aSopenharmony_ci    packsswb m2, [src2q+2*wq+mmsize]
163cabdff1aSopenharmony_ci    paddb    m1, m0
164cabdff1aSopenharmony_ci    paddb    m2, m0
165cabdff1aSopenharmony_ci    mova    [dstq +wq], m1
166cabdff1aSopenharmony_ci    mova    [dst2q+wq], m2
167cabdff1aSopenharmony_ci    jg      .loopx
168cabdff1aSopenharmony_ci
    ; advance by two rows; "mov wd, wspill" leaves the flags from "sub hd, 2"
    ; intact for the jg
169cabdff1aSopenharmony_ci    lea   srcq, [srcq+src_strideq*2]
170cabdff1aSopenharmony_ci    lea   dstq, [dstq+dst_strideq*2]
171cabdff1aSopenharmony_ci    sub     hd, 2
172cabdff1aSopenharmony_ci    mov     wd, wspill
173cabdff1aSopenharmony_ci    jg      .loopy
174cabdff1aSopenharmony_ci    RET
175cabdff1aSopenharmony_ci%endm
176cabdff1aSopenharmony_ci
; ADD_RECT suffix
;   %1 : suffix appended to the emitted function name add_rect_clamped_%1.
; Per sample: dst[x] = packuswb(((src[x] + pw_32) >> 6) + idwt[x]), i.e. the
; 16-bit src value is biased, arithmetically shifted right by 6, added to the
; 16-bit idwt value and stored as an unsigned-saturated byte.
; width is rounded up to a multiple of mmsize, so up to mmsize-1 extra samples
; per row are processed -- presumably covered by buffer padding, confirm.
; Row advance: src moves by 2*stride bytes, dst by stride bytes, idwt by
; 2*idwt_stride bytes.
; The single label .loop serves as both the inner (x) and the outer (y) loop
; target: w is reloaded from wspill before the outer branch.
177cabdff1aSopenharmony_ci%macro ADD_RECT 1
178cabdff1aSopenharmony_ci; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
179cabdff1aSopenharmony_cicglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    ; m0 = bias (external pw_32, presumably words of 32 -- TODO confirm)
180cabdff1aSopenharmony_ci    mova    m0, [pw_32]
    ; round width up to a multiple of mmsize
181cabdff1aSopenharmony_ci    add     wd, (mmsize-1)
182cabdff1aSopenharmony_ci    and     wd, ~(mmsize-1)
183cabdff1aSopenharmony_ci
    ; x86-64: sign-extend the int strides and keep the rounded width in r8d.
    ; x86-32: write the rounded width back to its own argument slot (r5m).
184cabdff1aSopenharmony_ci%if ARCH_X86_64
185cabdff1aSopenharmony_ci    movsxd   strideq, strided
186cabdff1aSopenharmony_ci    movsxd   idwt_strideq, idwt_strided
187cabdff1aSopenharmony_ci    mov   r8d, wd
188cabdff1aSopenharmony_ci    %define wspill r8d
189cabdff1aSopenharmony_ci%else
190cabdff1aSopenharmony_ci    mov    r5m, wd
191cabdff1aSopenharmony_ci    %define wspill r5m
192cabdff1aSopenharmony_ci%endif
193cabdff1aSopenharmony_ci
194cabdff1aSopenharmony_ci.loop:
    ; w counts output bytes, walked from the right edge towards 0. The flags
    ; set by this sub are consumed by the first "jg .loop" below; none of the
    ; vector instructions in between write the flags.
195cabdff1aSopenharmony_ci    sub     wd, mmsize
196cabdff1aSopenharmony_ci    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
197cabdff1aSopenharmony_ci    paddw   m1, m0
198cabdff1aSopenharmony_ci    psraw   m1, 6
199cabdff1aSopenharmony_ci    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
200cabdff1aSopenharmony_ci    paddw   m2, m0
201cabdff1aSopenharmony_ci    psraw   m2, 6
    ; idwt is used directly as a memory operand and dst is stored with mova
202cabdff1aSopenharmony_ci    paddw   m1, [idwtq+2*wq]
203cabdff1aSopenharmony_ci    paddw   m2, [idwtq+2*wq+mmsize]
204cabdff1aSopenharmony_ci    packuswb m1, m2
205cabdff1aSopenharmony_ci    mova    [dstq +wq], m1
206cabdff1aSopenharmony_ci    jg      .loop
207cabdff1aSopenharmony_ci
    ; next row; "mov wd, wspill" leaves the flags from "sub hd, 1" intact
208cabdff1aSopenharmony_ci    lea   srcq, [srcq + 2*strideq]
209cabdff1aSopenharmony_ci    add   dstq, strideq
210cabdff1aSopenharmony_ci    lea  idwtq, [idwtq+ 2*idwt_strideq]
211cabdff1aSopenharmony_ci    sub     hd, 1
212cabdff1aSopenharmony_ci    mov     wd, wspill
213cabdff1aSopenharmony_ci    jg      .loop
214cabdff1aSopenharmony_ci    RET
215cabdff1aSopenharmony_ci%endm
216cabdff1aSopenharmony_ci
; ADD_OBMC width, suffix
;   %1 : number of bytes handled per row (instantiated with 8, 16 and 32)
;   %2 : suffix appended to the emitted function name add_dirac_obmc%1_%2
; For each of yblen rows: dst[x] += src[x] * obmc[x] for x in 0..%1-1, where
; src and obmc are bytes zero-extended to 16 bits and dst is 16-bit; only the
; low 16 bits of each product are kept (pmullw).
; Row advance: src moves by stride bytes, dst by 2*stride bytes, and obmc by a
; fixed 32 bytes regardless of %1.
; src and obmc are loaded with mova (aligned); dst is accessed with movu.
; NOTE(review): cglobal requests 6 arguments while only 5 are named, and
; strideq is used without sign extension although the prototype comment says
; "int stride" -- confirm both.
217cabdff1aSopenharmony_ci%macro ADD_OBMC 2
218cabdff1aSopenharmony_ci; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
219cabdff1aSopenharmony_cicglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    ; m4 = 0, used to zero-extend bytes to words
220cabdff1aSopenharmony_ci    pxor        m4, m4
221cabdff1aSopenharmony_ci.loop:
    ; the row is unrolled at assembly time into %1/mmsize vector-sized chunks;
    ; i is the byte offset of the current chunk
222cabdff1aSopenharmony_ci%assign i 0
223cabdff1aSopenharmony_ci%rep %1 / mmsize
    ; m0/m1 = low/high half of the source bytes as words
224cabdff1aSopenharmony_ci    mova        m0, [srcq+i]
225cabdff1aSopenharmony_ci    mova        m1, m0
226cabdff1aSopenharmony_ci    punpcklbw   m0, m4
227cabdff1aSopenharmony_ci    punpckhbw   m1, m4
    ; m2/m3 = low/high half of the weight bytes as words
228cabdff1aSopenharmony_ci    mova        m2, [obmcq+i]
229cabdff1aSopenharmony_ci    mova        m3, m2
230cabdff1aSopenharmony_ci   punpcklbw   m2, m4
231cabdff1aSopenharmony_ci    punpckhbw   m3, m4
232cabdff1aSopenharmony_ci    pmullw      m0, m2
233cabdff1aSopenharmony_ci    pmullw      m1, m3
    ; accumulate into the 16-bit destination (two vectors per chunk of bytes)
234cabdff1aSopenharmony_ci    movu        m2, [dstq+2*i]
235cabdff1aSopenharmony_ci    movu        m3, [dstq+2*i+mmsize]
236cabdff1aSopenharmony_ci    paddw       m0, m2
237cabdff1aSopenharmony_ci    paddw       m1, m3
238cabdff1aSopenharmony_ci    movu        [dstq+2*i], m0
239cabdff1aSopenharmony_ci    movu        [dstq+2*i+mmsize], m1
240cabdff1aSopenharmony_ci%assign i i+mmsize
241cabdff1aSopenharmony_ci%endrep
242cabdff1aSopenharmony_ci    lea         srcq, [srcq+strideq]
243cabdff1aSopenharmony_ci    lea         dstq, [dstq+2*strideq]
244cabdff1aSopenharmony_ci    add         obmcq, 32
245cabdff1aSopenharmony_ci    sub         yblend, 1
246cabdff1aSopenharmony_ci    jg          .loop
247cabdff1aSopenharmony_ci    RET
248cabdff1aSopenharmony_ci%endm
249cabdff1aSopenharmony_ci
; Instantiate the macros above. The 8-byte-wide OBMC routine is emitted after
; INIT_MMX; everything else is emitted after INIT_XMM with the sse2 suffix.
250cabdff1aSopenharmony_ciINIT_MMX
251cabdff1aSopenharmony_ciADD_OBMC 8, mmx
252cabdff1aSopenharmony_ci
253cabdff1aSopenharmony_ciINIT_XMM
254cabdff1aSopenharmony_ciPUT_RECT sse2
255cabdff1aSopenharmony_ciADD_RECT sse2
256cabdff1aSopenharmony_ci
257cabdff1aSopenharmony_ciHPEL_FILTER sse2
258cabdff1aSopenharmony_ciADD_OBMC 32, sse2
259cabdff1aSopenharmony_ciADD_OBMC 16, sse2
260cabdff1aSopenharmony_ci
261cabdff1aSopenharmony_ciINIT_XMM sse4
262cabdff1aSopenharmony_ci
263cabdff1aSopenharmony_ci; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
; Dequantises tot_v rows of tot_h 32-bit coefficients:
;   dst = sign(src) * ((|src| * qf + qs) >> 2)      (logical shift; 0 stays 0)
; src rows are read back to back (tot_h dwords per row); dst rows are stride
; bytes apart. Four coefficients are handled per inner iteration, so when
; tot_h is not a multiple of 4 up to 3 extra dwords are read and written at
; the end of each row -- presumably covered by buffer padding, confirm.
; Once qf and qs have been broadcast into m2/m3, their argument registers are
; reused as scratch: r4 holds the original tot_h and r3 the start of the
; current dst row.
264cabdff1aSopenharmony_cicglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
    ; m2 = qf in every dword lane, m3 = qs in every dword lane
265cabdff1aSopenharmony_ci    movd   m2, qfd
266cabdff1aSopenharmony_ci    movd   m3, qsd
267cabdff1aSopenharmony_ci    SPLATD m2
268cabdff1aSopenharmony_ci    SPLATD m3
269cabdff1aSopenharmony_ci    mov    r4d, tot_hd
270cabdff1aSopenharmony_ci    mov    r3, dstq
271cabdff1aSopenharmony_ci
272cabdff1aSopenharmony_ci    .loop_v:
    ; restart the column counter and point dst at the start of this row
273cabdff1aSopenharmony_ci    mov    tot_hq, r4
274cabdff1aSopenharmony_ci    mov    dstq,   r3
275cabdff1aSopenharmony_ci
276cabdff1aSopenharmony_ci    .loop_h:
277cabdff1aSopenharmony_ci    movu   m0, [srcq]
278cabdff1aSopenharmony_ci
    ; work on magnitudes, then restore the sign (and zeros) of the input
279cabdff1aSopenharmony_ci    pabsd  m1, m0
280cabdff1aSopenharmony_ci    pmulld m1, m2
281cabdff1aSopenharmony_ci    paddd  m1, m3
282cabdff1aSopenharmony_ci    psrld  m1,  2
283cabdff1aSopenharmony_ci    psignd m1, m0
284cabdff1aSopenharmony_ci
285cabdff1aSopenharmony_ci    movu   [dstq], m1
286cabdff1aSopenharmony_ci
287cabdff1aSopenharmony_ci    add    srcq, mmsize
288cabdff1aSopenharmony_ci    add    dstq, mmsize
289cabdff1aSopenharmony_ci    sub    tot_hq, 4
290cabdff1aSopenharmony_ci    jg     .loop_h
    ; tot_hq is now <= 0: step src back by the overshoot (4 bytes per
    ; coefficient) so that the next row starts exactly tot_h dwords later
291cabdff1aSopenharmony_ci    lea    srcq, [srcq + 4*tot_hq]
292cabdff1aSopenharmony_ci
293cabdff1aSopenharmony_ci    add    r3, strideq
294cabdff1aSopenharmony_ci    dec    tot_vd
295cabdff1aSopenharmony_ci    jg     .loop_v
296cabdff1aSopenharmony_ci
297cabdff1aSopenharmony_ci    RET
298cabdff1aSopenharmony_ci
299cabdff1aSopenharmony_ciINIT_XMM sse4
300cabdff1aSopenharmony_ci; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
; Converts a rectangle of signed 32-bit samples to 16-bit samples:
; each sample has 0x200 added, is packed to a word with unsigned saturation
; (packusdw) and is then passed through CLIPW with bounds 0 and 0x3ff.
; Eight samples (32 source bytes -> 16 destination bytes) are handled per inner
; iteration, so a width that is not a multiple of 8 is rounded up --
; presumably covered by buffer padding, confirm.
; src_stride and dst_stride are added to the row pointers directly, i.e. they
; are byte counts.
; NOTE(review): both strides are declared int in the prototype comment but are
; used as full-width registers without sign extension, unlike in PUT_RECT --
; confirm on x86-64.
; x86-64 keeps height in a register (h); x86-32 counts it down in place in its
; argument slot (r5mp).
301cabdff1aSopenharmony_ci%if ARCH_X86_64
302cabdff1aSopenharmony_cicglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
303cabdff1aSopenharmony_ci%else
304cabdff1aSopenharmony_cicglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
305cabdff1aSopenharmony_ci    %define  hd  r5mp
306cabdff1aSopenharmony_ci%endif
    ; w = row length in source bytes (4 per sample); src is moved to the end of
    ; the row and w negated, so [srcq+wq] walks the row as w counts up to 0
307cabdff1aSopenharmony_ci    shl      wd, 2
308cabdff1aSopenharmony_ci    add    srcq, wq
309cabdff1aSopenharmony_ci    neg      wq
    ; t2 = start of the current dst row, t1 = negative row length to reload
310cabdff1aSopenharmony_ci    mov     t2q, dstq
311cabdff1aSopenharmony_ci    mov     t1q, wq
    ; m2 = 0 (lower clip bound), m3 = 0x3ff words (upper clip bound),
    ; m4 = 0x200 dwords (signed -> unsigned offset)
312cabdff1aSopenharmony_ci    pxor     m2, m2
313cabdff1aSopenharmony_ci    mova     m3, [clip_10bit]
314cabdff1aSopenharmony_ci    mova     m4, [convert_to_unsigned_10bit]
315cabdff1aSopenharmony_ci
316cabdff1aSopenharmony_ci    .loop_h:
317cabdff1aSopenharmony_ci    mov    dstq, t2q
318cabdff1aSopenharmony_ci    mov      wq, t1q
319cabdff1aSopenharmony_ci
320cabdff1aSopenharmony_ci    .loop_w:
321cabdff1aSopenharmony_ci    movu     m0, [srcq+wq+0*mmsize]
322cabdff1aSopenharmony_ci    movu     m1, [srcq+wq+1*mmsize]
323cabdff1aSopenharmony_ci
324cabdff1aSopenharmony_ci    paddd    m0, m4
325cabdff1aSopenharmony_ci    paddd    m1, m4
326cabdff1aSopenharmony_ci    packusdw m0, m0, m1
327cabdff1aSopenharmony_ci    CLIPW    m0, m2, m3 ; packusdw saturates so it's fine
328cabdff1aSopenharmony_ci
329cabdff1aSopenharmony_ci    movu     [dstq], m0
330cabdff1aSopenharmony_ci
    ; one output vector consumed two input vectors; loop while w is still < 0
331cabdff1aSopenharmony_ci    add      dstq, 1*mmsize
332cabdff1aSopenharmony_ci    add      wq,   2*mmsize
333cabdff1aSopenharmony_ci    jl       .loop_w
334cabdff1aSopenharmony_ci
    ; next row: src already points at the row end, so adding the stride keeps
    ; it at the end of the following row
335cabdff1aSopenharmony_ci    add    srcq, src_strideq
336cabdff1aSopenharmony_ci    add     t2q, dst_strideq
337cabdff1aSopenharmony_ci    sub      hd, 1
338cabdff1aSopenharmony_ci    jg       .loop_h
339cabdff1aSopenharmony_ci
340cabdff1aSopenharmony_ci    RET
341