1;******************************************************************************
2;* MMX/SSE2-optimized functions for the RV40 decoder
3;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION_RODATA
27
; Multiplier used with pmulhrsw in the SSSE3 weighting code (RV40_WEIGHT
; loads it into m1): eight words of 1 << 10.
28pw_1024:   times 8 dw 1 << (16 - 6) ; 8 x 1024
29
; 6-tap coefficients as interleaved signed byte pairs, the layout consumed by
; pmaddubsw in FILTER_SSSE3.  Three sets of 32 bytes each: the first 16 bytes
; of a set are read at offset 0 (outer tap pair, applied on both ends of the
; kernel), the next 16 bytes at offset 16 (the two centre taps).
; The my/mx function argument is added to the table address as a byte offset
; and so selects the set.
; Over all six taps every set sums to 64: 2*(1-5)+52+20 and 2*(2-10)+40+40.
30sixtap_filter_hb_m:  times 8 db   1, -5
31                     times 8 db  52, 20
32                     ; symmetric set, multiplied by 2 to have the same shift
33                     times 8 db   2, -10
34                     times 8 db  40,  40
35                     ; back to normal (mirror image of the first set)
36                     times 8 db   1, -5
37                     times 8 db  20, 52
38
; The same three filters as 16-bit words, for pmullw in FILTER_V / FILTER_H.
; A set is four rows of 16 bytes (64 bytes in total), read at offsets
; +0 (outermost taps), +16 (next-to-outermost taps), +32 and +48 (the two
; centre taps).
39sixtap_filter_v_m:   times 8 dw   1
40                     times 8 dw  -5
41                     times 8 dw  52
42                     times 8 dw  20
43                     ; symmetric set, multiplied by 2 to have the same shift
44                     times 8 dw   2
45                     times 8 dw -10
46                     times 8 dw  40
47                     times 8 dw  40
48                     ; back to normal (mirror image of the first set)
49                     times 8 dw   1
50                     times 8 dw  -5
51                     times 8 dw  20
52                     times 8 dw  52
53
; Table-base selection, used as the second argument of LOAD.
; PIC builds: each function asks cglobal for one extra general-purpose
; register (npicregs = 1), loads the table address into it with lea, and LOAD
; adds that register to the offset argument.
; Non-PIC builds: LOAD adds the table symbol directly; no extra register.
; NOTE(review): sixtap_filter_hw is not referenced anywhere in the visible
; code, and its non-PIC expansion sixtap_filter_hw_m has no definition here.
; It looks like a leftover; confirm before using it.
54%ifdef PIC
55%define sixtap_filter_hw   picregq
56%define sixtap_filter_hb   picregq
57%define sixtap_filter_v    picregq
58%define npicregs 1
59%else
60%define sixtap_filter_hw   sixtap_filter_hw_m
61%define sixtap_filter_hb   sixtap_filter_hb_m
62%define sixtap_filter_v    sixtap_filter_v_m
63%define npicregs 0
64%endif
65
; pshufb masks for the SSSE3 horizontal filter.  They are applied to 16 bytes
; loaded from src-2, so byte index i is pixel (i - 2).  For each of the eight
; output pixels x they build the byte pairs that pmaddubsw multiplies:
;   shuf1: (src[x-2], src[x-1])   paired with the outer coefficients
;   shuf2: (src[x],   src[x+1])   paired with the centre coefficients
;   shuf3: (src[x+3], src[x+2])   paired with the outer coefficients again
; The highest index used is 12, i.e. 13 of the 16 loaded bytes are consumed.
66filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
67filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
68filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
69
; Constants defined outside this file.  Their values are not visible here;
; the names suggest packed words of 32, 16 and 512 (TODO confirm).
; pw_32:  added before the >> 6 in the SSE2 qpel filters
; pw_16:  added before the >> 5 in the SSE2 weighting code
; pw_512: pmulhrsw multiplier in the SSSE3 qpel filters
70cextern  pw_32
71cextern  pw_16
72cextern  pw_512
73
74SECTION .text
75
76;-----------------------------------------------------------------------------
77; subpel MC functions:
78;
79; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int dststride,
80;                                         uint8_t *src, int srcstride,
81;                                         int len, int m);
; len is the number of rows to filter; m is a byte offset into the 6-tap
; coefficient table and selects which filter is applied.
82;-----------------------------------------------------------------------------
; LOAD index, table
; Turns the coefficient-table offset argument into a pointer to the selected
; coefficient set:  index = table base + index.
;   %1 = name of the offset argument (my or mx), without a size suffix
;   %2 = table base symbol; not used in PIC builds, where picregq must
;        already hold the table address
; Clobbers: flags (from the add).
83%macro LOAD  2
; The offset is declared as a 32-bit int in the prototype comment, but the add
; below is a full-width pointer operation.  Sign-extend on every 64-bit ABI,
; not only Win64: callers are not required to extend a 32-bit argument into
; the upper half of the register.  ARCH_X86_64 is provided by the build
; configuration that x86inc already depends on.
84%if ARCH_X86_64
85   movsxd   %1q, %1d
86%endif
87%ifdef PIC
88   add      %1q, picregq
89%else
90   add      %1q, %2
91%endif
92%endmacro
93
; STORE acc, tmp, op
; Packs the signed words in acc to unsigned bytes with saturation and writes
; the result to [dstq] with movh.
;   %1 = register holding the filtered words (overwritten by the pack)
;   %2 = scratch register, written only when op is avg
;   %3 = put: store the packed bytes as they are
;        avg: load the bytes currently at [dstq] and combine them with the
;             packed result through PAVGB before storing
94%macro STORE 3
95%ifidn %3, avg
96    movh      %2, [dstq]
97%endif
98    packuswb  %1, %1
99%ifidn %3, avg
100    PAVGB     %1, %2
101%endif
102    movh  [dstq], %1
103%endmacro
104
; FILTER_V op
; Emits <op>_rv40_qpel_v: vertical 6-tap filter built on 16-bit multiplies.
;   %1 = put (store the result) or avg (average with what is already in dst)
; Function arguments: dst, dststride, src, srcstride, height, my
;   my = byte offset of the coefficient set inside sixtap_filter_v_m
; One output row is produced per loop iteration.  The loop body runs before
; the counter is tested, so height is assumed to be at least 1.
; Register use: m0-m4 = sliding window of five source rows widened to words,
; m5 = newest row, m6 = accumulator, m7 = zero, m8-m11 = cached coefficients
; when those registers exist.
105%macro FILTER_V 1
106cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
107%ifdef PIC
108    lea  picregq, [sixtap_filter_v_m]
109%endif
110    pxor      m7, m7                      ; zero, used to widen bytes to words
    ; myq becomes the address of the selected coefficient set
111    LOAD      my, sixtap_filter_v
112
113    ; read 5 lines: rows -2 .. +2 relative to the incoming src
114    sub     srcq, srcstrideq
115    sub     srcq, srcstrideq
116    movh      m0, [srcq]                  ; row -2
117    movh      m1, [srcq+srcstrideq]       ; row -1
118    movh      m2, [srcq+srcstrideq*2]     ; row  0
119    lea     srcq, [srcq+srcstrideq*2]
120    add     srcq, srcstrideq              ; srcq now points at row +1
121    movh      m3, [srcq]                  ; row +1
122    movh      m4, [srcq+srcstrideq]       ; row +2
123    punpcklbw m0, m7
124    punpcklbw m1, m7
125    punpcklbw m2, m7
126    punpcklbw m3, m7
127    punpcklbw m4, m7
128
    ; Keep the four coefficient vectors in registers when m8 and up are
    ; defined; otherwise use them as memory operands.
129%ifdef m8
130    mova      m8, [myq+ 0]
131    mova      m9, [myq+16]
132    mova     m10, [myq+32]
133    mova     m11, [myq+48]
134%define COEFF05  m8
135%define COEFF14  m9
136%define COEFF2   m10
137%define COEFF3   m11
138%else
139%define COEFF05  [myq+ 0]
140%define COEFF14  [myq+16]
141%define COEFF2   [myq+32]
142%define COEFF3   [myq+48]
143%endif
144.nextrow:
    ; m6 = (row1 + row4) * COEFF14 + (row0 + row5) * COEFF05
    ;    +  row2 * COEFF2 + row3 * COEFF3 + pw_32
    ; where row0..row4 are m0..m4 on entry and row5 is the row read below.
    ; The window rotation (m0 <- m1, ..., m4 <- m5) is interleaved with the
    ; arithmetic: each register is copied forward just before it is reused.
145    mova      m6, m1
146    movh      m5, [srcq+2*srcstrideq]      ; read new row (two rows below srcq)
147    paddw     m6, m4
148    punpcklbw m5, m7
149    pmullw    m6, COEFF14
150    paddw     m0, m5
151    pmullw    m0, COEFF05
152    paddw     m6, m0
153    mova      m0, m1                      ; rotate: row1 becomes row0
154    paddw     m6, [pw_32]                 ; rounding term for the shift by 6
155    mova      m1, m2                      ; rotate: row2 becomes row1
156    pmullw    m2, COEFF2
157    paddw     m6, m2
158    mova      m2, m3                      ; rotate: row3 becomes row2
159    pmullw    m3, COEFF3
160    paddw     m6, m3
161
162    ; round/clip/store
163    mova      m3, m4                      ; rotate: row4 becomes row3
164    psraw     m6, 6
165    mova      m4, m5                      ; rotate: new row becomes row4
    ; m5 has just been copied to m4, so it is free to serve as STORE scratch
166    STORE     m6, m5, %1
167
168    ; go to next line
169    add     dstq, dststrideq
170    add     srcq, srcstrideq
171    dec  heightd                           ; next row
172    jg .nextrow
173    REP_RET
174%endmacro
175
; FILTER_H op
; Emits <op>_rv40_qpel_h: horizontal 6-tap filter built on 16-bit multiplies.
;   %1 = put (store the result) or avg (average with what is already in dst)
; Function arguments: dst, dststride, src, srcstride, height, mx
;   mx = byte offset of the coefficient set inside sixtap_filter_v_m (the
;        word table is shared with the vertical filter)
; One output row is produced per loop iteration.  The loop body runs before
; the counter is tested, so height is assumed to be at least 1.
; Register use: m0-m5 = the six shifted copies of the source row,
; m6 = pw_32 rounding term, m7 = zero, m8-m11 = cached coefficients when
; those registers exist.
176%macro FILTER_H  1
177cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
178%ifdef PIC
179    lea  picregq, [sixtap_filter_v_m]
180%endif
181    pxor      m7, m7                      ; zero, used to widen bytes to words
    ; mxq becomes the address of the selected coefficient set
182    LOAD      mx, sixtap_filter_v
183    mova      m6, [pw_32]
    ; Keep the four coefficient vectors in registers when m8 and up are
    ; defined; otherwise use them as memory operands.
184%ifdef m8
185    mova      m8, [mxq+ 0]
186    mova      m9, [mxq+16]
187    mova     m10, [mxq+32]
188    mova     m11, [mxq+48]
189%define COEFF05  m8
190%define COEFF14  m9
191%define COEFF2   m10
192%define COEFF3   m11
193%else
194%define COEFF05  [mxq+ 0]
195%define COEFF14  [mxq+16]
196%define COEFF2   [mxq+32]
197%define COEFF3   [mxq+48]
198%endif
199.nextrow:
    ; result = ((src[x-2] + src[x+3]) * COEFF05 + (src[x-1] + src[x+2]) * COEFF14
    ;          + src[x] * COEFF2 + src[x+1] * COEFF3 + pw_32) >> 6
200    movq      m0, [srcq-2]
201    movq      m5, [srcq+3]
202    movq      m1, [srcq-1]
203    movq      m4, [srcq+2]
204    punpcklbw m0, m7
205    punpcklbw m5, m7
206    punpcklbw m1, m7
207    punpcklbw m4, m7
208    movq      m2, [srcq-0]
209    movq      m3, [srcq+1]
210    paddw     m0, m5                      ; outermost taps share a coefficient
211    paddw     m1, m4                      ; so do the next-to-outermost taps
212    punpcklbw m2, m7
213    punpcklbw m3, m7
214    pmullw    m0, COEFF05
215    pmullw    m1, COEFF14
216    pmullw    m2, COEFF2
217    pmullw    m3, COEFF3
218    paddw     m0, m6                      ; rounding term for the shift by 6
219    paddw     m1, m2
220    paddw     m0, m3
221    paddw     m0, m1
222    psraw     m0, 6
    ; m1 has been folded into m0 and is free to serve as STORE scratch
223    STORE     m0, m1, %1
224
225    ; go to next line
226    add     dstq, dststrideq
227    add     srcq, srcstrideq
228    dec  heightd            ; next row
229    jg .nextrow
230    REP_RET
231%endmacro
232
; Instantiate the pmullw-based filters with XMM registers for SSE2:
; put and avg variants of both the horizontal and the vertical function.
233INIT_XMM  sse2
234FILTER_H  put
235FILTER_H  avg
236FILTER_V  put
237FILTER_V  avg
238
; FILTER_SSSE3 op
; Emits both <op>_rv40_qpel_v and <op>_rv40_qpel_h using pmaddubsw on
; interleaved byte pairs and the byte coefficient table sixtap_filter_hb_m.
;   %1 = put (store the result) or avg (average with what is already in dst)
; Function arguments: dst, dststride, src, srcstride, height, my (or mx)
;   my/mx = byte offset of the coefficient set inside sixtap_filter_hb_m
; Rounding is done with pmulhrsw by pw_512; with 512 in every word that is
; (x * 512 + 0x4000) >> 15, i.e. (x + 32) >> 6 (the value of pw_512 is defined
; outside this file, TODO confirm).
; Both loops run their body before testing the counter, so height is assumed
; to be at least 1.
239%macro FILTER_SSSE3 1
240cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
241%ifdef PIC
242    lea  picregq, [sixtap_filter_hb_m]
243%endif
244
245    ; read 5 lines: rows -2 .. +2 relative to the incoming src, kept as bytes
246    sub     srcq, srcstrideq
    ; myq becomes the address of the selected coefficient set
247    LOAD      my, sixtap_filter_hb
248    sub     srcq, srcstrideq
249    movh      m0, [srcq]                  ; row -2
250    movh      m1, [srcq+srcstrideq]       ; row -1
251    movh      m2, [srcq+srcstrideq*2]     ; row  0
252    lea     srcq, [srcq+srcstrideq*2]
253    add     srcq, srcstrideq
254    mova      m5, [myq]                   ; outer tap pair, used twice per row
255    movh      m3, [srcq]                  ; row +1
256    movh      m4, [srcq+srcstrideq]       ; row +2
257    lea     srcq, [srcq+2*srcstrideq]     ; srcq now points at row +3
258
259.nextrow:
    ; On entry m0..m4 hold rows n-2 .. n+2 and srcq points at row n+3.
260    mova      m6, m2
261    punpcklbw m0, m1                      ; pairs (row n-2, row n-1)
262    punpcklbw m6, m3                      ; pairs (row n,   row n+1)
263    pmaddubsw m0, m5                      ; outer tap pair
264    pmaddubsw m6, [myq+16]                ; centre tap pair
265    movh      m7, [srcq]      ; read new row (n+3)
266    paddw     m6, m0
267    mova      m0, m1                      ; slide the window down by one row
268    mova      m1, m2
269    mova      m2, m3
270    mova      m3, m4
271    mova      m4, m7
272    punpcklbw m7, m3                      ; pairs (row n+3, row n+2); m3 holds row n+2 now
273    pmaddubsw m7, m5                      ; outer tap pair again, mirrored
274    paddw     m6, m7
275    pmulhrsw  m6, [pw_512]
    ; m7 has been added into m6 and is free to serve as STORE scratch
276    STORE     m6, m7, %1
277
278    ; go to next line
279    add     dstq, dststrideq
280    add     srcq, srcstrideq
281    dec       heightd                          ; next row
282    jg       .nextrow
283    REP_RET
284
285cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
286%ifdef PIC
287    lea  picregq, [sixtap_filter_hb_m]
288%endif
289    mova      m3, [filter_h6_shuf2]
290    mova      m4, [filter_h6_shuf3]
    ; mxq becomes the address of the selected coefficient set
291    LOAD      mx, sixtap_filter_hb
292    mova      m5, [mxq] ; set up 6tap filter in bytes: outer tap pair
293    mova      m6, [mxq+16]                ; centre tap pair
294    mova      m7, [filter_h6_shuf1]
295
296.nextrow:
297    movu      m0, [srcq-2]                ; unaligned load starting at src[-2]
298    mova      m1, m0
299    mova      m2, m0
300    pshufb    m0, m7                      ; pairs (src[x-2], src[x-1])
301    pshufb    m1, m3                      ; pairs (src[x],   src[x+1])
302    pshufb    m2, m4                      ; pairs (src[x+3], src[x+2])
303    pmaddubsw m0, m5
304    pmaddubsw m1, m6
305    pmaddubsw m2, m5
306    paddw     m0, m1
307    paddw     m0, m2
308    pmulhrsw  m0, [pw_512]
    ; m1 has been added into m0 and is free to serve as STORE scratch
309    STORE     m0, m1, %1
310
311    ; go to next line
312    add     dstq, dststrideq
313    add     srcq, srcstrideq
314    dec  heightd            ; next row
315    jg .nextrow
316    REP_RET
317%endmacro
318
; Instantiate the pmaddubsw-based filters with XMM registers for SSSE3; each
; macro call emits both the vertical and the horizontal function.
319INIT_XMM ssse3
320FILTER_SSSE3  put
321FILTER_SSSE3  avg
322
323; %1=1 for 5-bit weights, 0 for 14-bit weights  %2=dst  %3=src1  %4=src2
; %5=stride, passed only for 8-pixel-wide blocks with XMM registers.  Its
; presence is what matters (tested through %0); the body always uses r5 as
; the stride register.
; Blends one register worth of pixels from src1 and src2 and stores it at dst.
; Expects: r6 = current byte offset, m0 = zero, m1 = rounding constant,
; m2 / m3 = weights as prepared by RV40_WEIGHT.
; Clobbers: m4-m7 and flags.  With five arguments r6 is left one stride
; further on (pointing at the second of the two rows just processed).
324%macro RV40_WCORE  4-5
325    movh       m4, [%3 + r6 + 0]
326    movh       m5, [%4 + r6 + 0]
    ; Pick where the second half of the register comes from: the other half of
    ; the same row (four arguments) or the next row (five arguments).
327%if %0 == 4
328%define OFFSET r6 + mmsize / 2
329%else
330    ; 8x8 block and SSE2, stride was provided
331%define OFFSET r6
332    add        r6, r5
333%endif
334    movh       m6, [%3 + OFFSET]
335    movh       m7, [%4 + OFFSET]
336
337%if %1 == 0
338    ; 14-bit weights: (pixel << 7) times weight, high word kept, which is
    ; (pixel * weight) >> 9; src1 is scaled by m3 and src2 by m2
339    punpcklbw  m4, m0
340    punpcklbw  m5, m0
341    punpcklbw  m6, m0
342    punpcklbw  m7, m0
343
344    psllw      m4, 7
345    psllw      m5, 7
346    psllw      m6, 7
347    psllw      m7, 7
348    pmulhw     m4, m3
349    pmulhw     m5, m2
350    pmulhw     m6, m3
351    pmulhw     m7, m2
352
353    paddw      m4, m5
354    paddw      m6, m7
355%else
356    ; 5-bit weights
357%if cpuflag(ssse3)
    ; interleave src1/src2 bytes; m3 holds the matching byte pair of weights,
    ; so one pmaddubsw yields the weighted sum
358    punpcklbw  m4, m5
359    punpcklbw  m6, m7
360
361    pmaddubsw  m4, m3
362    pmaddubsw  m6, m3
363%else
    ; widen to words and multiply: src1 by m3, src2 by m2
364    punpcklbw  m4, m0
365    punpcklbw  m5, m0
366    punpcklbw  m6, m0
367    punpcklbw  m7, m0
368
369    pmullw     m4, m3
370    pmullw     m5, m2
371    pmullw     m6, m3
372    pmullw     m7, m2
373    paddw      m4, m5
374    paddw      m6, m7
375%endif
376
377%endif
378
379    ; bias and shift down
380%if cpuflag(ssse3)
    ; m1 = pw_1024: (x * 1024 + 0x4000) >> 15, i.e. (x + 16) >> 5
381    pmulhrsw   m4, m1
382    pmulhrsw   m6, m1
383%else
    ; m1 = pw_16: add the bias, then shift right by 5
384    paddw      m4, m1
385    paddw      m6, m1
386    psrlw      m4, 5
387    psrlw      m6, 5
388%endif
389
390    packuswb   m4, m6                     ; low half from m4, high half from m6
391%if %0 == 5
392    ; Only called for 8x8 blocks and SSE2
    ; low half goes to the first row, high half to the row one stride below
393    sub        r6, r5
394    movh       [%2 + r6], m4
395    add        r6, r5
396    movhps     [%2 + r6], m4
397%else
398    mova       [%2 + r6], m4
399%endif
400%endmacro
401
402
; MAIN_LOOP size, mode
; Body of one iteration of the weighting loop.
;   first argument  = block size (8 or 16)
;   second argument = weight mode, forwarded as the first argument of
;                     RV40_WCORE
; Every path ends with "add r6, r5"; the zero flag of that add is what the
; jnz following the macro in RV40_WEIGHT tests, so nothing that changes flags
; may be placed after it.
; NOTE(review): the mmsize == 8 branch is not instantiated anywhere in the
; visible code (only INIT_XMM is used below).
403%macro MAIN_LOOP   2
404%if mmsize == 8
    ; 8-byte registers: one core call covers 8 pixels, two for a 16-wide row
405    RV40_WCORE %2, r0, r1, r2
406%if %1 == 16
407    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
408%endif
409
410    ; Prepare for next loop
411    add        r6, r5
412%else
413%ifidn %1, 8
    ; 8-wide block: the core call handles two rows and advances r6 by one
    ; stride itself; the add below supplies the second stride
414    RV40_WCORE %2, r0, r1, r2, r5
415    ; Prepare 2 next lines
416    add        r6, r5
417%else
    ; 16-wide block: one full register per row
418    RV40_WCORE %2, r0, r1, r2
419    ; Prepare single next line
420    add        r6, r5
421%endif
422%endif
423
424%endmacro
425
426; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
427; %1=rnd or nornd  %2=block size (8 or 16)  %3=log2 of the block size
428; The weights are FP0.14 notation of fractions depending on pts.
429; For timebases without rounding error (e.g. PAL), the fractions
430; can be simplified, and several operations can be avoided.
431; Whether the weights are multiples of 2^9 is not tested in this macro:
432; the rnd variants use the 14-bit weight path of RV40_WCORE and the nornd
; variants use its 5-bit weight path.
; NOTE(review): presumably the caller picks the variant and, for nornd,
; passes weights already reduced to 5 bits -- confirm against the C side.
; Register use: r0/r1/r2 = dst/src1/src2 advanced past the end of the block,
; r5 = stride, r6 = negative byte offset counting up to zero,
; m0 = zero, m1 = rounding constant, m2 = w1, m3 = w2 (for SSSE3 nornd, m3
; holds the byte pair w2, w1 in every word).
; NOTE(review): r5 is used at full register width without being widened, so
; stride has to be valid in the whole register on 64-bit targets -- confirm
; that the C prototype uses a pointer-sized type.
433%macro RV40_WEIGHT  3
434cglobal rv40_weight_func_%1_%2, 6, 7, 8
435%if cpuflag(ssse3)
436    mova       m1, [pw_1024]
437%else
438    mova       m1, [pw_16]
439%endif
440    pxor       m0, m0
441    ; Set loop counter and increments
442    mov        r6, r5
443    shl        r6, %3                     ; r6 = stride * block size
444    add        r0, r6
445    add        r1, r6
446    add        r2, r6
447    neg        r6                         ; index from the end, towards zero
448
449    movd       m2, r3d                    ; w1
450    movd       m3, r4d                    ; w2
    ; RND is the mode flag handed to RV40_WCORE: 0 selects the 14-bit weight
    ; path (rnd functions), 1 selects the 5-bit weight path (nornd functions).
451%ifidn %1,rnd
452%define  RND   0
453    SPLATW     m2, m2
454%else
455%define  RND   1
456%if cpuflag(ssse3)
    ; low word of m3 becomes the byte pair (w2, w1), matching the src1/src2
    ; byte interleave fed to pmaddubsw
457    punpcklbw  m3, m2
458%else
459    SPLATW     m2, m2
460%endif
461%endif
462    SPLATW     m3, m3
463
464.loop:
    ; MAIN_LOOP ends with an add to r6; loop until the offset reaches zero
465    MAIN_LOOP  %2, RND
466    jnz        .loop
467    REP_RET
468%endmacro
469
; Instantiate the weighting functions with XMM registers.  Arguments are the
; rounding variant, the block size and log2 of the block size; the third must
; match the second because it scales the stride into the total block length.
470INIT_XMM sse2
471RV40_WEIGHT   rnd,    8, 3
472RV40_WEIGHT   rnd,   16, 4
473RV40_WEIGHT   nornd,  8, 3
474RV40_WEIGHT   nornd, 16, 4
475
; Same four functions again, assembled with the SSSE3 code paths enabled.
476INIT_XMM ssse3
477RV40_WEIGHT   rnd,    8, 3
478RV40_WEIGHT   rnd,   16, 4
479RV40_WEIGHT   nornd,  8, 3
480RV40_WEIGHT   nornd, 16, 4
481