1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* x86 optimizations for PNG decoding
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
5cabdff1aSopenharmony_ci;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
8cabdff1aSopenharmony_ci;*
9cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
10cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
11cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
12cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
13cabdff1aSopenharmony_ci;*
14cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
15cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
18cabdff1aSopenharmony_ci;*
19cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
20cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
21cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22cabdff1aSopenharmony_ci;******************************************************************************
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ciSECTION_RODATA
27cabdff1aSopenharmony_ci
28cabdff1aSopenharmony_cicextern pw_255
29cabdff1aSopenharmony_ci
30cabdff1aSopenharmony_ciSECTION .text
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_ci; %1 = nr. of xmm registers used
;
; Emits add_bytes_l2(dst, src1, src2, w):
;   dst[i] = src1[i] + src2[i] for 0 <= i < w, as wrapping byte additions.
;
; Register roles:
;   wa - current loop bound (w rounded down to the chunk size of the loop
;        being run); its low byte (wab) doubles as the temporary in the
;        scalar tail
;   w  - copy of the full width
;   i  - running byte offset into dst/src1/src2
;
; All loop conditions are signed compares against a sign-extended w, so a
; zero or negative w performs no loads or stores.
33cabdff1aSopenharmony_ci%macro ADD_BYTES_FN 1
34cabdff1aSopenharmony_cicglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
35cabdff1aSopenharmony_ci%if ARCH_X86_64
36cabdff1aSopenharmony_ci    movsxd             waq, wad  ; w is a 32-bit signed argument
37cabdff1aSopenharmony_ci%endif
38cabdff1aSopenharmony_ci    xor                 iq, iq
39cabdff1aSopenharmony_ci
40cabdff1aSopenharmony_ci    ; vector loop: two registers (2*mmsize bytes) per iteration
41cabdff1aSopenharmony_ci    mov                 wq, waq  ; keep the full width for the later loops
42cabdff1aSopenharmony_ci    and                waq, ~(mmsize*2-1)
43cabdff1aSopenharmony_ci    jmp .end_v
44cabdff1aSopenharmony_ci.loop_v:
45cabdff1aSopenharmony_ci    movu                m0, [src2q+iq]
46cabdff1aSopenharmony_ci    movu                m1, [src2q+iq+mmsize]
47cabdff1aSopenharmony_ci    paddb               m0, [src1q+iq]
48cabdff1aSopenharmony_ci    paddb               m1, [src1q+iq+mmsize]
49cabdff1aSopenharmony_ci    movu  [dstq+iq       ], m0
50cabdff1aSopenharmony_ci    movu  [dstq+iq+mmsize], m1
51cabdff1aSopenharmony_ci    add                 iq, mmsize*2
52cabdff1aSopenharmony_ci.end_v:
53cabdff1aSopenharmony_ci    cmp                 iq, waq
54cabdff1aSopenharmony_ci    jl .loop_v
55cabdff1aSopenharmony_ci
56cabdff1aSopenharmony_ci%if mmsize == 16
57cabdff1aSopenharmony_ci    ; 8-byte loop for what the 2*mmsize loop left over
58cabdff1aSopenharmony_ci    mov                waq, wq
59cabdff1aSopenharmony_ci    and                waq, ~7
60cabdff1aSopenharmony_ci    jmp .end_l
61cabdff1aSopenharmony_ci.loop_l:
    ; Use 64-bit moves on the xmm registers m0/m1 rather than MMX mm0: this
    ; variant never executes emms, so touching an MMX register would leave
    ; the shared MMX/x87 state dirty for the caller. Both operands are loaded
    ; explicitly because a memory operand to the 128-bit paddb would read
    ; 16 bytes and require alignment.
62cabdff1aSopenharmony_ci    movq                m0, [src1q+iq]
    movq                m1, [src2q+iq]
63cabdff1aSopenharmony_ci    paddb               m0, m1
64cabdff1aSopenharmony_ci    movq  [dstq+iq       ], m0
65cabdff1aSopenharmony_ci    add                 iq, 8
66cabdff1aSopenharmony_ci.end_l:
67cabdff1aSopenharmony_ci    cmp                 iq, waq
68cabdff1aSopenharmony_ci    jl .loop_l
69cabdff1aSopenharmony_ci%endif
70cabdff1aSopenharmony_ci
71cabdff1aSopenharmony_ci    ; scalar loop for leftover
72cabdff1aSopenharmony_ci    jmp .end_s
73cabdff1aSopenharmony_ci.loop_s:
74cabdff1aSopenharmony_ci    mov                wab, [src1q+iq]  ; wa is free here: the bound is wq
75cabdff1aSopenharmony_ci    add                wab, [src2q+iq]
76cabdff1aSopenharmony_ci    mov          [dstq+iq], wab
77cabdff1aSopenharmony_ci    inc                 iq
78cabdff1aSopenharmony_ci.end_s:
79cabdff1aSopenharmony_ci    cmp                 iq, wq
80cabdff1aSopenharmony_ci    jl .loop_s
81cabdff1aSopenharmony_ci    REP_RET
82cabdff1aSopenharmony_ci%endmacro
83cabdff1aSopenharmony_ci
; Instantiate add_bytes_l2 once per instruction set. The MMX variant is only
; assembled for 32-bit x86 and declares no xmm registers.
84cabdff1aSopenharmony_ci%if ARCH_X86_32
85cabdff1aSopenharmony_ciINIT_MMX mmx
86cabdff1aSopenharmony_ciADD_BYTES_FN 0
87cabdff1aSopenharmony_ci%endif
88cabdff1aSopenharmony_ci
; SSE2 variant: the macro body uses two vector registers (m0, m1).
89cabdff1aSopenharmony_ciINIT_XMM sse2
90cabdff1aSopenharmony_ciADD_BYTES_FN 2
91cabdff1aSopenharmony_ci
; %1 = nr. of xmm registers used
;
; Emits add_png_paeth_prediction(dst, src, top, w, bpp):
;   for each position, picks one of left / top / top-left with the Paeth
;   rule, adds it to the src byte, masks the sum to 8 bits and stores it in
;   dst. "left" is the already reconstructed dst value bpp bytes earlier,
;   "top" is read from the top row at the same offset, and "top-left" is the
;   top value bpp bytes earlier.
;
; Register roles inside the loops (all unpacked to 16-bit words):
;   m0 = left      (result of the previous step, kept unpacked)
;   m1 = top       (reloaded every step)
;   m2 = top-left  (previous m1)
;   m7 = zero      (unpack helper; temporarily reused in the non-SSSE3 abs)
;
; Loop structure: the outer .bpp_loop walks the pixel in slices of mmsize/2
; bytes (cntr = index of the slice, counted down to 0); the inner .loop
; advances along the row in steps of bpp bytes for that slice.
92cabdff1aSopenharmony_ci%macro ADD_PAETH_PRED_FN 1
93cabdff1aSopenharmony_cicglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
94cabdff1aSopenharmony_ci%if ARCH_X86_64
95cabdff1aSopenharmony_ci    movsxd            bppq, bppd  ; bpp and w are 32-bit signed arguments
96cabdff1aSopenharmony_ci    movsxd              wq, wd
97cabdff1aSopenharmony_ci%endif
98cabdff1aSopenharmony_ci    lea               endq, [dstq+wq-(mmsize/2-1)]  ; inner loop runs while dst < end
99cabdff1aSopenharmony_ci    sub               topq, dstq  ; top and src become offsets relative to dst,
100cabdff1aSopenharmony_ci    sub               srcq, dstq  ; so one moving pointer addresses all three rows
101cabdff1aSopenharmony_ci    sub               dstq, bppq  ; start one pixel to the left
102cabdff1aSopenharmony_ci    pxor                m7, m7
103cabdff1aSopenharmony_ci
104cabdff1aSopenharmony_ci    PUSH              dstq  ; saved start pointer, reloaded for every slice
105cabdff1aSopenharmony_ci    lea              cntrq, [bppq-1]
106cabdff1aSopenharmony_ci    shr              cntrq, 2 + mmsize/16  ; cntr = index of the last slice
107cabdff1aSopenharmony_ci.bpp_loop:
108cabdff1aSopenharmony_ci    lea               dstq, [dstq+cntrq*(mmsize/2)]  ; move to this slice of the pixel
109cabdff1aSopenharmony_ci    movh                m0, [dstq]        ; left
110cabdff1aSopenharmony_ci    movh                m1, [topq+dstq]   ; becomes top-left at the top of .loop
111cabdff1aSopenharmony_ci    punpcklbw           m0, m7
112cabdff1aSopenharmony_ci    punpcklbw           m1, m7
113cabdff1aSopenharmony_ci    add               dstq, bppq
114cabdff1aSopenharmony_ci.loop:
115cabdff1aSopenharmony_ci    mova                m2, m1  ; top-left = previous top
116cabdff1aSopenharmony_ci    movh                m1, [topq+dstq]  ; top
117cabdff1aSopenharmony_ci    mova                m3, m2
118cabdff1aSopenharmony_ci    punpcklbw           m1, m7
119cabdff1aSopenharmony_ci    mova                m4, m2
120cabdff1aSopenharmony_ci    psubw               m3, m1  ; m3 = top-left - top
121cabdff1aSopenharmony_ci    psubw               m4, m0  ; m4 = top-left - left
122cabdff1aSopenharmony_ci    mova                m5, m3
123cabdff1aSopenharmony_ci    paddw               m5, m4  ; m5 = 2*top-left - top - left
    ; Take absolute values: m3 = pa (distance for left), m4 = pb (top),
    ; m5 = pc (top-left).
124cabdff1aSopenharmony_ci%if cpuflag(ssse3)
125cabdff1aSopenharmony_ci    pabsw               m3, m3
126cabdff1aSopenharmony_ci    pabsw               m4, m4
127cabdff1aSopenharmony_ci    pabsw               m5, m5
128cabdff1aSopenharmony_ci%else ; !cpuflag(ssse3)
    ; abs(x) = max(x, 0 - x); m6/m7 hold the negated values and m7 is
    ; cleared again afterwards because it must stay zero for the unpacks.
129cabdff1aSopenharmony_ci    psubw               m7, m5
130cabdff1aSopenharmony_ci    pmaxsw              m5, m7
131cabdff1aSopenharmony_ci    pxor                m6, m6
132cabdff1aSopenharmony_ci    pxor                m7, m7
133cabdff1aSopenharmony_ci    psubw               m6, m3
134cabdff1aSopenharmony_ci    psubw               m7, m4
135cabdff1aSopenharmony_ci    pmaxsw              m3, m6
136cabdff1aSopenharmony_ci    pmaxsw              m4, m7
137cabdff1aSopenharmony_ci    pxor                m7, m7
138cabdff1aSopenharmony_ci%endif ; cpuflag(ssse3)
139cabdff1aSopenharmony_ci    mova                m6, m4
140cabdff1aSopenharmony_ci    pminsw              m6, m5  ; m6 = min(pb, pc)
141cabdff1aSopenharmony_ci    pcmpgtw             m3, m6  ; m3 = mask: pa > min(pb, pc), i.e. left is not chosen
142cabdff1aSopenharmony_ci    pcmpgtw             m4, m5  ; m4 = mask: pb > pc
143cabdff1aSopenharmony_ci    mova                m6, m4
144cabdff1aSopenharmony_ci    pand                m4, m3  ; m4 = mask: choose top-left
145cabdff1aSopenharmony_ci    pandn               m6, m3  ; m6 = mask: choose top
146cabdff1aSopenharmony_ci    pandn               m3, m0  ; m3 = left where left is chosen, else 0
147cabdff1aSopenharmony_ci    movh                m0, [srcq+dstq]  ; m0 is free now: load the src bytes
148cabdff1aSopenharmony_ci    pand                m6, m1  ; top where chosen
149cabdff1aSopenharmony_ci    pand                m2, m4  ; top-left where chosen
150cabdff1aSopenharmony_ci    punpcklbw           m0, m7
151cabdff1aSopenharmony_ci    paddw               m0, m6
152cabdff1aSopenharmony_ci    paddw               m3, m2
153cabdff1aSopenharmony_ci    paddw               m0, m3  ; src + selected predictor
154cabdff1aSopenharmony_ci    pand                m0, [pw_255]  ; presumably words of 255: keep the low 8 bits of each sum
155cabdff1aSopenharmony_ci    mova                m3, m0  ; pack a copy; m0 stays unpacked as the next "left"
156cabdff1aSopenharmony_ci    packuswb            m3, m3
157cabdff1aSopenharmony_ci    movh            [dstq], m3
158cabdff1aSopenharmony_ci    add               dstq, bppq
159cabdff1aSopenharmony_ci    cmp               dstq, endq
160cabdff1aSopenharmony_ci    jl .loop  ; NOTE(review): signed compare on pointers - confirm this is intended
161cabdff1aSopenharmony_ci
162cabdff1aSopenharmony_ci    mov               dstq, [rsp]  ; back to the saved start pointer (dst - bpp)
163cabdff1aSopenharmony_ci    dec              cntrq
164cabdff1aSopenharmony_ci    jge .bpp_loop  ; continue while the decremented slice index is >= 0
165cabdff1aSopenharmony_ci    POP               dstq
166cabdff1aSopenharmony_ci    RET
167cabdff1aSopenharmony_ci%endmacro
168cabdff1aSopenharmony_ci
; Instantiate add_png_paeth_prediction twice, both under INIT_MMX and with
; 0 xmm registers declared. The ssse3 instance takes the cpuflag(ssse3)
; branch of the macro (pabsw); the mmxext instance takes the psubw/pmaxsw
; fallback.
; NOTE(review): neither instance issues emms; presumably the caller is
; responsible for that - confirm.
169cabdff1aSopenharmony_ciINIT_MMX mmxext
170cabdff1aSopenharmony_ciADD_PAETH_PRED_FN 0
171cabdff1aSopenharmony_ci
172cabdff1aSopenharmony_ciINIT_MMX ssse3
173cabdff1aSopenharmony_ciADD_PAETH_PRED_FN 0
174