;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Eight words of 1 << 10.  The SSSE3 weight functions multiply by this with
; pmulhrsw in the place where the SSE2 ones add pw_16 and shift right by 5.
pw_1024:   times 8 dw 1 << (16 - 6)

; Byte coefficient pairs for the SSSE3 qpel filters (operands of pmaddubsw).
; Each filter is a 32-byte entry: a 16-byte vector holding the outer pair,
; followed by a 16-byte vector holding the centre pair.  The functions add
; their mx/my argument to the table address and then read +0 and +16, so that
; argument is a byte offset into this table (presumably a multiple of 32 --
; confirm against the callers).
sixtap_filter_hb_m:  times 8 db   1, -5
                     times 8 db  52, 20
                     ; multiplied by 2 to have the same shift
                     times 8 db   2, -10
                     times 8 db  40,  40
                     ; back to normal
                     times 8 db   1, -5
                     times 8 db  20, 52

; Word coefficients for the SSE2 qpel filters (operands of pmullw); used by
; both the vertical and the horizontal SSE2 functions.
; Each filter is a 64-byte entry of four 16-byte vectors, read at offsets
; +0, +16, +32 and +48: the coefficient shared by the two outermost taps, the
; coefficient shared by the next pair of taps, then the two centre taps.
; The functions add their mx/my argument to the table address, so that
; argument is a byte offset into this table (presumably a multiple of 64 --
; confirm against the callers).
sixtap_filter_v_m:   times 8 dw   1
                     times 8 dw  -5
                     times 8 dw  52
                     times 8 dw  20
                     ; multiplied by 2 to have the same shift
                     times 8 dw   2
                     times 8 dw -10
                     times 8 dw  40
                     times 8 dw  40
                     ; back to normal
                     times 8 dw   1
                     times 8 dw  -5
                     times 8 dw  20
                     times 8 dw  52

; Table operand handed to the LOAD macro.
; Under PIC each function first loads the table address into picreg with lea,
; and the alias resolves to that register; otherwise the alias is the table
; symbol itself.  npicregs is the number of extra general-purpose registers
; the functions request from cglobal for this purpose.
%ifdef PIC
%define sixtap_filter_hb   picregq
%define sixtap_filter_v    picregq
%define npicregs 1
%else
%define sixtap_filter_hb   sixtap_filter_hb_m
%define sixtap_filter_v    sixtap_filter_v_m
%define npicregs 0
%endif

; pshufb masks for the SSSE3 horizontal filter.  They are applied to 16 bytes
; loaded from src-2, so byte i of the shuffled register is src[i-2].  For
; output pixel x each mask produces the byte pair that pmaddubsw multiplies:
;   shuf1: (src[x-2], src[x-1])
;   shuf2: (src[x],   src[x+1])
;   shuf3: (src[x+3], src[x+2])   - reversed order, so the same coefficient
;                                   vector as for shuf1 can be used
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

; Constants defined outside this file:
;   pw_32  - added before the >> 6 in the SSE2 qpel filters
;   pw_16  - added before the >> 5 in the SSE2 weight functions
;   pw_512 - pmulhrsw multiplier in the SSSE3 qpel filters
cextern  pw_32
cextern  pw_16
cextern  pw_512

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
;                                         uint8_t *src, int srcstride,
;                                         int len, int m);
;
; NOTE(review): the code below uses both strides as full-width registers
; without sign extension, so the C prototypes presumably declare them as
; ptrdiff_t rather than int -- confirm against the C declarations.
;-----------------------------------------------------------------------------
; LOAD arg, table
; Turns the byte offset held in argument <arg> into an absolute address
; inside a coefficient table.
;   %1 = argument name without size suffix (mx or my)
;   %2 = table operand; used in non-PIC builds only.  Under PIC the address
;        that the caller already placed in picreg is added instead.
; The offset is passed as a 32-bit int.  No 64-bit ABI guarantees the upper
; half of the register (or stack slot) that carries it, so it is sign-extended
; on every x86-64 target before being used as a 64-bit value.
%macro LOAD  2
%if ARCH_X86_64
    movsxd   %1q, %1d
%endif
%ifdef PIC
    add      %1q, picregq
%else
    add      %1q, %2
%endif
%endmacro

; STORE acc, tmp, op
; Packs the words in <acc> to bytes with unsigned saturation and writes a
; movh-sized chunk to [dstq].
;   %1 = register holding the filtered words (clobbered)
;   %2 = scratch register, only touched when op is avg
;   %3 = put or avg; for avg the packed result is averaged (PAVGB) with the
;        bytes already present at [dstq] before being stored
%macro STORE 3
%ifidn %3, avg
    movh      %2, [dstq]
%endif
    packuswb  %1, %1
%ifidn %3, avg
    PAVGB     %1, %2
%endif
    movh  [dstq], %1
%endmacro

; FILTER_V op   (op = put or avg)
; Emits <op>_rv40_qpel_v: vertical 6-tap filter over a movh-wide strip of
; <height> rows, done in word arithmetic with pmullw.
;
; Register roles:
;   m0..m4 = the five most recent source rows unpacked to words, oldest first
;            (rows -2..+2 relative to the current output row)
;   m5     = newly loaded row (+3), m6 = accumulator, m7 = zero
;   myq    = address of the selected entry in sixtap_filter_v_m
;
; Per output row:
;   ((m0 + m5) * COEFF05 + (m1 + m4) * COEFF14
;     + m2 * COEFF2 + m3 * COEFF3 + pw_32) >> 6
; followed by STORE (saturating pack, optional average with dst).
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    pxor      m7, m7
    LOAD      my, sixtap_filter_v

    ; read 5 lines, starting two rows above src; srcq ends up at src + stride
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

    ; keep the coefficients in registers when m8..m11 are available,
    ; otherwise use them as memory operands on every iteration
%ifdef m8
    mova      m8, [myq+ 0]
    mova      m9, [myq+16]
    mova     m10, [myq+32]
    mova     m11, [myq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [myq+ 0]
%define COEFF14  [myq+16]
%define COEFF2   [myq+32]
%define COEFF3   [myq+48]
%endif
.nextrow:
    ; accumulate the six taps while shifting the row window down by one
    ; (m0 <- m1 <- m2 <- m3 <- m4 <- m5)
    mova      m6, m1
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    paddw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, COEFF14
    paddw     m0, m5
    pmullw    m0, COEFF05
    paddw     m6, m0
    mova      m0, m1
    paddw     m6, [pw_32]
    mova      m1, m2
    pmullw    m2, COEFF2
    paddw     m6, m2
    mova      m2, m3
    pmullw    m3, COEFF3
    paddw     m6, m3

    ; round/clip/store
    mova      m3, m4
    psraw     m6, 6
    mova      m4, m5                       ; m5 is free again: STORE may use it as scratch
    STORE     m6, m5, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET
%endmacro

; FILTER_H op   (op = put or avg)
; Emits <op>_rv40_qpel_h: horizontal 6-tap filter over a movh-wide strip of
; <height> rows, done in word arithmetic with pmullw.  It reads its
; coefficients from the same word table as the vertical SSE2 filter
; (sixtap_filter_v_m).
;
; Register roles:
;   m0..m5 = source bytes at src-2, src-1, src, src+1, src+2, src+3 as words
;   m6     = pw_32, m7 = zero
;   mxq    = address of the selected entry in sixtap_filter_v_m
;
; Per output row:
;   ((m0 + m5) * COEFF05 + (m1 + m4) * COEFF14
;     + m2 * COEFF2 + m3 * COEFF3 + pw_32) >> 6
; followed by STORE (saturating pack, optional average with dst).
%macro FILTER_H  1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    pxor      m7, m7
    LOAD      mx, sixtap_filter_v
    mova      m6, [pw_32]
    ; keep the coefficients in registers when m8..m11 are available,
    ; otherwise use them as memory operands on every iteration
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [mxq+ 0]
%define COEFF14  [mxq+16]
%define COEFF2   [mxq+32]
%define COEFF3   [mxq+48]
%endif
.nextrow:
    ; six overlapping 8-byte loads, one per tap position
    movq      m0, [srcq-2]
    movq      m5, [srcq+3]
    movq      m1, [srcq-1]
    movq      m4, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m5, m7
    punpcklbw m1, m7
    punpcklbw m4, m7
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    paddw     m0, m5                       ; outer taps share one coefficient
    paddw     m1, m4                       ; so do the second pair
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, COEFF05
    pmullw    m1, COEFF14
    pmullw    m2, COEFF2
    pmullw    m3, COEFF3
    paddw     m0, m6
    paddw     m1, m2
    paddw     m0, m3
    paddw     m0, m1
    psraw     m0, 6
    STORE     m0, m1, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET
%endmacro

; SSE2 versions of the put/avg horizontal and vertical qpel filters
INIT_XMM  sse2
FILTER_H  put
FILTER_H  avg
FILTER_V  put
FILTER_V  avg

; FILTER_SSSE3 op   (op = put or avg)
; Emits <op>_rv40_qpel_v and <op>_rv40_qpel_h built on pmaddubsw, with byte
; coefficient pairs taken from sixtap_filter_hb_m:
;   [entry+0]  = outer pair, applied to two different byte pairs
;   [entry+16] = centre pair
; Both functions sum three pmaddubsw results per output pixel, scale the sum
; with pmulhrsw by pw_512, and hand it to STORE (saturating pack, optional
; average with dst).
%macro FILTER_SSSE3 1
; Vertical filter.
;   m0..m4 = the five most recent source rows as bytes, oldest first
;   m5     = outer coefficient pair, m6 = accumulator, m7 = newly loaded row
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif

    ; read 5 lines, starting two rows above src; srcq ends up at src + 3*stride
    sub     srcq, srcstrideq
    LOAD      my, sixtap_filter_hb
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    mova      m5, [myq]
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    lea     srcq, [srcq+2*srcstrideq]

.nextrow:
    mova      m6, m2
    punpcklbw m0, m1          ; (row-2, row-1) pairs
    punpcklbw m6, m3          ; (row0, row+1) pairs
    pmaddubsw m0, m5
    pmaddubsw m6, [myq+16]
    movh      m7, [srcq]      ; read new row
    paddw     m6, m0
    ; shift the row window down by one
    mova      m0, m1
    mova      m1, m2
    mova      m2, m3
    mova      m3, m4
    mova      m4, m7
    punpcklbw m7, m3          ; (row+3, row+2) pairs: reversed, so m5 applies again
    pmaddubsw m7, m5
    paddw     m6, m7
    pmulhrsw  m6, [pw_512]
    STORE     m6, m7, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec       heightd                          ; next row
    jg       .nextrow
    REP_RET

; Horizontal filter.
;   m3, m4, m7 = shuffle masks filter_h6_shuf2, _shuf3, _shuf1
;   m5 = outer coefficient pair, m6 = centre coefficient pair
;   m0, m1, m2 = the three shuffled copies of the 16 bytes at src-2
cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
    LOAD      mx, sixtap_filter_hb
    mova      m5, [mxq] ; set up 6tap filter in bytes
    mova      m6, [mxq+16]
    mova      m7, [filter_h6_shuf1]

.nextrow:
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m7
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m5
    paddw     m0, m1
    paddw     m0, m2
    pmulhrsw  m0, [pw_512]
    STORE     m0, m1, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET
%endmacro

; SSSE3 versions of the put/avg horizontal and vertical qpel filters
INIT_XMM ssse3
FILTER_SSSE3  put
FILTER_SSSE3  avg

; RV40_WCORE flag, dst, src1, src2 [, stride]
; Weights and stores two movh-sized groups of pixels.
;   %1 = 1 for 5-bit weights, 0 for 14-bit weights
;   %2 = dst, %3 = src1, %4 = src2 (base expressions; r6 is added as offset)
;   %5 = stride; its presence selects the two-row form.  The body uses r5
;        directly for the stride, so %5 only acts as a switch.
;
; Four-argument form: both groups come from the same row (second group at
; offset mmsize/2) and are written with a single mova.
; Five-argument form: the second group comes from the next row; the two halves
; are written with movh/movhps and r6 is left advanced by one row.
;
; Expected on entry:
;   m0 = zero
;   m1 = rounding constant (pmulhrsw multiplier on SSSE3, bias otherwise)
;   m2, m3 = weights; src1 is scaled by m3 and src2 by m2.  For 5-bit weights
;            on SSSE3 only m3 is used, holding both weights as byte pairs.
; Clobbers m4-m7.
%macro RV40_WCORE  4-5
    movh       m4, [%3 + r6 + 0]
    movh       m5, [%4 + r6 + 0]
%if %0 == 4
%define OFFSET r6 + mmsize / 2
%else
    ; 8x8 block and SSE2, stride was provided
%define OFFSET r6
    add        r6, r5
%endif
    movh       m6, [%3 + OFFSET]
    movh       m7, [%4 + OFFSET]

%if %1 == 0
    ; 14-bit weights
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    ; pixel << 7, then the high word of the product with the weight
    psllw      m4, 7
    psllw      m5, 7
    psllw      m6, 7
    psllw      m7, 7
    pmulhw     m4, m3
    pmulhw     m5, m2
    pmulhw     m6, m3
    pmulhw     m7, m2

    paddw      m4, m5
    paddw      m6, m7
%else
    ; 5-bit weights
%if cpuflag(ssse3)
    ; interleave src1/src2 bytes and multiply-add against the byte weight pairs
    punpcklbw  m4, m5
    punpcklbw  m6, m7

    pmaddubsw  m4, m3
    pmaddubsw  m6, m3
%else
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    pmullw     m4, m3
    pmullw     m5, m2
    pmullw     m6, m3
    pmullw     m7, m2
    paddw      m4, m5
    paddw      m6, m7
%endif

%endif

    ; bias and shift down
%if cpuflag(ssse3)
    pmulhrsw   m4, m1
    pmulhrsw   m6, m1
%else
    paddw      m4, m1
    paddw      m6, m1
    psrlw      m4, 5
    psrlw      m6, 5
%endif

    packuswb   m4, m6
%if %0 == 5
    ; Only called for 8x8 blocks and SSE2
    ; step back to the first row for the low half, forward again for the high half
    sub        r6, r5
    movh       [%2 + r6], m4
    add        r6, r5
    movhps     [%2 + r6], m4
%else
    mova       [%2 + r6], m4
%endif
%endmacro


; MAIN_LOOP size, flag
; One iteration of the weight loop: runs RV40_WCORE for the current offset
; and advances r6 by the stride in r5.
;   %1 = block size (8 or 16)
;   %2 = weight-format flag, passed through as RV40_WCORE's first argument
; The final instruction is always "add r6, r5"; the caller's loop branch tests
; the flags it leaves.
; With 16-byte registers and size 8, RV40_WCORE is invoked in its two-row
; form, so one iteration covers two rows.  The mmsize == 8 branch applies
; only when the macro is expanded with 8-byte registers.
%macro MAIN_LOOP   2
%if mmsize == 8
    RV40_WCORE %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif

    ; Prepare for next loop
    add        r6, r5
%else
%ifidn %1, 8
    RV40_WCORE %2, r0, r1, r2, r5
    ; Prepare 2 next lines
    add        r6, r5
%else
    RV40_WCORE %2, r0, r1, r2
    ; Prepare single next line
    add        r6, r5
%endif
%endif

%endmacro

; void ff_rv40_weight_func_<%1>_<%2>_<opt>(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                                          int w1, int w2, int stride)
; %1 = rnd or nornd   %2 = block size (8 or 16)   %3 = shift applied to the
; stride to obtain the total byte span (3 for size 8, 4 for size 16)
;
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; The rnd variants take the 14-bit weights; the nornd variants take the
; simplified 5-bit weights.
; NOTE(review): no multiple-of-2^9 test is performed in this file; the variant
; is presumably selected by the caller -- confirm.
; NOTE(review): the stride in r5 is used as a full-width register without sign
; extension, so the C prototype presumably declares it as ptrdiff_t -- confirm.
;
; Register roles:
;   r0 = dst, r1 = src1, r2 = src2, all advanced past the end of the block
;   r3d = w1, r4d = w2, r5 = stride
;   r6 = negative byte offset from the advanced pointers, stepped toward zero
;   m0 = zero, m1 = rounding constant, m2 = w1, m3 = w2 (see RV40_WCORE)
%macro RV40_WEIGHT  3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
    mova       m1, [pw_1024]
%else
    mova       m1, [pw_16]
%endif
    pxor       m0, m0
    ; Set loop counter and increments
    mov        r6, r5
    shl        r6, %3
    add        r0, r6
    add        r1, r6
    add        r2, r6
    neg        r6

    movd       m2, r3d
    movd       m3, r4d
    ; RND is handed to RV40_WCORE as its weight-format flag:
    ; 0 selects the 14-bit path (rnd), 1 the 5-bit path (nornd)
%ifidn %1,rnd
%define  RND   0
    SPLATW     m2, m2
%else
%define  RND   1
%if cpuflag(ssse3)
    ; pack both weights as a byte pair so one pmaddubsw handles src1 and src2
    punpcklbw  m3, m2
%else
    SPLATW     m2, m2
%endif
%endif
    SPLATW     m3, m3

.loop:
    MAIN_LOOP  %2, RND
    ; MAIN_LOOP ends with "add r6, r5": loop until the offset reaches zero
    jnz        .loop
    REP_RET
%endmacro

; RV40_WEIGHT variant, size, stride shift
INIT_XMM sse2
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4

INIT_XMM ssse3
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4