;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; 128-bit constant whose low quadword is 1 and whose high quadword is 0.
; The setup macros add it to a register holding log2_denom to obtain the
; log2_denom+1 shift count.
sq_1: dq 1
      dq 0

; Word constants defined outside this file.
cextern pw_1
cextern pw_1023
; Upper clipping bound used after weighting (1023 = 2^10 - 1).
%define pw_pixel_max pw_1023

SECTION .text

;-----------------------------------------------------------------------------
; void ff_h264_weight_16_10(uint8_t *dst, int stride, int height,
;                           int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
; WEIGHT_PROLOGUE: entry sequence shared by the h264_weight_*_10 functions.
; PROLOGUE is invoked with 0 auto-loaded arguments, 6 general-purpose
; registers and 8 vector registers; the arguments are then fetched by hand:
;   r0 = dst, r1 = stride, r2 = height, r4 = weight, r5 = offset
; log2_denom (argument 3) is not loaded here; WEIGHT_SETUP reads it straight
; from r3m.
%macro WEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,6,8
    movifnidn  r0, r0mp
    movifnidn r1d, r1m
    movifnidn r2d, r2m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
%endmacro

; WEIGHT_SETUP: build the loop-invariant vector constants used by WEIGHT_OP.
; In:  r4  = weight, r5 = offset, r3m = log2_denom
; Out: m0  = 1<<log2_denom, splatted by SPLATW
;      m2  = log2_denom+1 (shift count for psrad)
;      m3  = every dword holds { low word: weight*2, high word: offset*8+1 }
;      m4  = pw_pixel_max (upper clip bound)
;      m7  = 0 (lower clip bound; only initialised when SSE4 is unavailable)
; Clobbers r4 and r5.
%macro WEIGHT_SETUP 0
    mova       m0, [pw_1]
    movd       m2, r3m
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    ; Pack weight*2 and offset*8+1 into a single dword.  weight*2 is truncated
    ; to 16 bits before the addition: adding a sign-extended negative weight
    ; at full register width would borrow from the upper word and drop the
    ; "+1" rounding term stored there.
    add        r4d, r4d     ; weight*2
    movzx      r4d, r4w     ; keep only the low word
    shl        r5d, 19      ; *8, move to upper half of dword
    lea        r5d, [r5+r4+0x10000]
    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro

; WEIGHT_OP off            : weight one full vector loaded from [r0+off]
; WEIGHT_OP off0, off1     : weight two 8-byte halves loaded from [r0+off0]
;                            and [r0+off1] (low half / high half of the result)
; Requires the constants prepared by WEIGHT_SETUP (m0, m2, m3, m4 and, without
; SSE4, m7).
; Out: m5 = weighted, clipped words.  Clobbers m6.
%macro WEIGHT_OP 1-2
%if %0==1
    mova        m5, [r0+%1]
    punpckhwd   m6, m5, m0  ; pair each high pixel with 1<<log2_denom
    punpcklwd   m5, m0      ; pair each low pixel with 1<<log2_denom
%else
    movq        m5, [r0+%1]
    movq        m6, [r0+%2]
    punpcklwd   m5, m0
    punpcklwd   m6, m0
%endif
    ; per dword: pixel * (weight*2) + (1<<log2_denom) * (offset*8+1)
    pmaddwd     m5, m3
    pmaddwd     m6, m3
    psrad       m5, m2      ; >> (log2_denom+1)
    psrad       m6, m2
%if cpuflag(sse4)
    ; unsigned-saturating pack handles the lower bound, pminsw the upper one
    packusdw    m5, m6
    pminsw      m5, m4
%else
    packssdw    m5, m6
    CLIPW       m5, m7, m4  ; clip to [m7, m4] = [0, pw_pixel_max]
%endif
%endmacro

; WEIGHT_FUNC_DBL: emits h264_weight_16_10 for the current instruction set.
; Each loop iteration weights one row in place: two vectors at [r0] and
; [r0+16], then advances r0 by the stride (r1).  The loop is a dec/jnz
; do-while on r2d (height), so the body always runs at least once.
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0
    mova [r0   ], m5
    WEIGHT_OP 16
    mova [r0+16], m5
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

; Instantiate the SSE2 and SSE4 variants.
INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL


; WEIGHT_FUNC_MM: emits h264_weight_8_10 for the current instruction set.
; Each loop iteration weights one vector at [r0] in place and advances r0 by
; the stride (r1).  The loop is a dec/jnz do-while on r2d (height), so the
; body always runs at least once.
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP   0
    mova     [r0], m5
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

; Instantiate the SSE2 and SSE4 variants.
INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM


; WEIGHT_FUNC_HALF_MM: emits h264_weight_4_10 for the current instruction set.
; Two rows are handled per iteration: 8 bytes from [r0] and 8 bytes from
; [r0+r1] are weighted together in one vector, then written back as the low
; and high halves of m5.  The row counter is therefore halved up front and r0
; advances by 2*stride (kept in r3).
; r3 is only overwritten after WEIGHT_SETUP has read log2_denom through r3m.
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
    WEIGHT_PROLOGUE
    sar         r2d, 1      ; height/2 iterations, two rows each
    WEIGHT_SETUP
    lea         r3, [r1*2]  ; 2*stride
.nextrow:
    WEIGHT_OP    0, r1
    movh      [r0], m5      ; first row  <- low half
    movhps [r0+r1], m5      ; second row <- high half
    add         r0, r3
    dec         r2d
    jnz .nextrow
    REP_RET
%endmacro

; Instantiate the SSE2 and SSE4 variants.
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM


;-----------------------------------------------------------------------------
; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t *src, int stride,
;                             int height, int log2_denom, int weightd,
;                             int weights, int offset);
;-----------------------------------------------------------------------------
; t0 is the scratch register that receives the offset argument in the
; biweight code: r3 on 32-bit x86, r7 on x86-64.  When t0 is r3 it shares the
; register later used for height, which is why BIWEIGHT_SETUP loads height
; only after it has consumed t0.
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif

; BIWEIGHT_PROLOGUE: entry sequence shared by the h264_biweight_*_10 functions.
; PROLOGUE is invoked with 0 auto-loaded arguments, 8 general-purpose
; registers and 8 vector registers; the arguments are then fetched by hand:
;   r0 = dst, r1 = src, r2 = stride, r5 = weightd, r6 = weights, t0 = offset
; height (argument 3) and log2_denom (argument 4) are not loaded here;
; BIWEIGHT_SETUP reads them from r3m and r4m.
%macro BIWEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,8,8
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn r2d, r2m
    movifnidn r5d, r5m
    movifnidn r6d, r6m
    movifnidn t0d, r7m
%endmacro

; BIWEIGHT_SETUP: build the loop-invariant vector constants used by BIWEIGHT.
; In:  r5 = weightd, r6 = weights, t0 = offset, r3m = height, r4m = log2_denom
; Out: m3 = pw_pixel_max (upper clip bound)
;      m4 = every dword holds { low word: weightd, high word: weights }
;      m5 = every dword holds (((offset<<2)+1)|1) << log2_denom
;      m6 = log2_denom+1 (shift count for psrad)
;      m7 = 0 (lower clip bound; only initialised when SSE4 is unavailable)
;      r3 = height
; Clobbers r5, r6 and t0.
%macro BIWEIGHT_SETUP 0
    lea        t0, [t0*4+1] ; (offset<<2)+1
    or         t0, 1        ; no-op on the already odd value; mirrors the (x+1)|1 form
    ; Pack the two weights into one dword.  weightd is zero-extended from 16
    ; bits first: OR-ing in a sign-extended negative weightd would overwrite
    ; the upper word, which must hold weights.
    movzx      r5d, r5w
    shl        r6d, 16
    or         r5d, r6d
    movd       m4, r5d      ; weightd | weights
    movd       m5, t0d      ; ((offset<<2)+1)|1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
    paddd      m6, [sq_1]   ; log2_denom+1
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn r3d, r3m      ; height; loaded last because t0 may alias r3
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro

; BIWEIGHT off          : combine one full vector from [r0+off] (dst) with the
;                         vector at [r1+off] (src)
; BIWEIGHT off0, off1   : combine two 8-byte halves, taken at off0 and off1
;                         from both r0 and r1 (low half / high half of result)
; Requires the constants prepared by BIWEIGHT_SETUP (m3, m4, m5, m6 and,
; without SSE4, m7).
; Out: m0 = weighted, clipped words.  Clobbers m1 and m2.
%macro BIWEIGHT 1-2
%if %0==1
    mova       m0, [r0+%1]
    mova       m1, [r1+%1]
    punpckhwd  m2, m0, m1   ; interleave dst (low word) with src (high word)
    punpcklwd  m0, m1
%else
    movq       m0, [r0+%1]
    movq       m1, [r1+%1]
    punpcklwd  m0, m1
    movq       m2, [r0+%2]
    movq       m1, [r1+%2]
    punpcklwd  m2, m1
%endif
    pmaddwd    m0, m4       ; per dword: dst*weightd + src*weights
    pmaddwd    m2, m4
    paddd      m0, m5       ; + offset/rounding term
    paddd      m2, m5
    psrad      m0, m6       ; >> (log2_denom+1)
    psrad      m2, m6
%if cpuflag(sse4)
    ; unsigned-saturating pack handles the lower bound, pminsw the upper one
    packusdw   m0, m2
    pminsw     m0, m3
%else
    packssdw   m0, m2
    CLIPW      m0, m7, m3   ; clip to [m7, m3] = [0, pw_pixel_max]
%endif
%endmacro

; BIWEIGHT_FUNC_DBL: emits h264_biweight_16_10 for the current instruction set.
; Each loop iteration combines one row of dst (r0) and src (r1), two vectors
; wide, writes the result to dst and advances both pointers by the stride
; (r2).  The loop is a dec/jnz do-while on r3d (height), so the body always
; runs at least once.
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT   0
    mova [r0   ], m0
    BIWEIGHT  16
    mova [r0+16], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

; Instantiate the SSE2 and SSE4 variants.
INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL

; BIWEIGHT_FUNC: emits h264_biweight_8_10 for the current instruction set.
; Each loop iteration combines one vector of dst (r0) and src (r1), writes the
; result to dst and advances both pointers by the stride (r2).  The loop is a
; dec/jnz do-while on r3d (height), so the body always runs at least once.
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT  0
    mova   [r0], m0
    add      r0, r2
    add      r1, r2
    dec      r3d
    jnz .nextrow
    REP_RET
%endmacro

; Instantiate the SSE2 and SSE4 variants.
INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC

; BIWEIGHT_FUNC_HALF: emits h264_biweight_4_10 for the current instruction set.
; Two rows are handled per iteration: 8 bytes at offset 0 and 8 bytes at
; offset r2 (one stride further) are combined in one vector, then written
; back to dst as the low and high halves of m0.  The row counter is therefore
; halved and both pointers advance by 2*stride (kept in r4).
; r4 is only overwritten after BIWEIGHT_SETUP has read log2_denom through r4m.
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
    sar        r3d, 1       ; height/2 iterations, two rows each
    lea        r4, [r2*2]   ; 2*stride
.nextrow:
    BIWEIGHT     0, r2
    movh   [r0   ], m0      ; first row  <- low half
    movhps [r0+r2], m0      ; second row <- high half
    add         r0, r4
    add         r1, r4
    dec         r3d
    jnz .nextrow
    REP_RET
%endmacro

; Instantiate the SSE2 and SSE4 variants.
INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF