;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
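;
; Computes the scalar product of v1 and v2 and, in the same pass, updates v1
; in place with v1 += mul * v3. A rough C sketch of the behaviour (an
; illustrative approximation, not the exact FFmpeg C reference):
;
;     int32_t res = 0;
;     for (int i = 0; i < order; i++) {
;         res   += v1[i] * v2[i];
;         v1[i] += mul * v3[i];
;     }
;     return res;
;
; The SSE2 loop below consumes two 16-byte blocks (16 samples) per iteration,
; so order must be a multiple of 16 and v1 must be 16-byte aligned (mova);
; v2 and v3 may be unaligned (movu).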
INIT_XMM sse2
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1              ; order counts int16 samples; convert to bytes
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7          ; broadcast mul to all 8 words of m7
    pxor    m6, m6             ; m6 accumulates the scalar product as dwords
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq                 ; index from the end with a negative offset
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4             ; pairs of v1*v2 products, summed into dwords
    pmaddwd m1, m5
    pmullw  m2, m7             ; mul * v3 (low 16 bits)
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4             ; v1 += mul * v3
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
    HADDD   m6, m0             ; horizontal sum of the four partial sums
    movd   eax, m6
    RET

INIT_XMM sse4
; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
;                                     int order, int mul)
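;
; Same madd-and-accumulate as the int16 version above, but v2 holds 32-bit
; coefficients, so the products need a full 32x32-bit multiply (pmulld, hence
; the SSE4 requirement). Only the scalar-product half changes; as a sketch
; (illustrative, not the exact FFmpeg C reference):
;
;     res += v1[i] * v2[i];    /* v2[i] is int32_t -> 32x32-bit multiply */
;
; One 16-byte block of v1 (8 samples) is processed per iteration, so order
; must be a multiple of 8; v1 must be 16-byte aligned.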
cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    SPLATW  m7, m7             ; broadcast mul to all 8 words of m7
    pxor    m6, m6
    add v1q, orderq
    lea v2q, [v2q + 2*orderq]  ; v2 holds int32, so its byte stride is doubled
    add v3q, orderq
    neg orderq
.loop:
    mova    m3, [v1q + orderq]
    movu    m0, [v2q + 2*orderq]
    pmovsxwd m4, m3            ; sign-extend the low 4 words of v1 to dwords
    movu    m1, [v2q + 2*orderq + mmsize]
    movhlps m5, m3
    movu    m2, [v3q + orderq]
    pmovsxwd m5, m5            ; sign-extend the high 4 words of v1
    pmullw  m2, m7
    pmulld  m0, m4             ; 32x32-bit products of v1 and v2
    pmulld  m1, m5
    paddw   m2, m3             ; v1 += mul * v3
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    add     orderq, 16
    jl .loop
    HADDD   m6, m0
    movd   eax, m6
    RET

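; SCALARPRODUCT_LOOP %1 emits one loop body specialised for a byte
; misalignment %1 of v2/v3. For %1 != 0 it never issues unaligned loads:
; the wanted 16 bytes are stitched together from two adjacent aligned loads
; with palignr. As a byte-wise sketch (illustrative C, with lo/hi being the
; aligned 16-byte blocks at p and p+16; not part of the original source):
;
;     /* unaligned 16-byte load at p + shift, shift in [0,16) */
;     for (int i = 0; i < 16; i++)
;         dst[i] = (i + shift < 16) ? lo[i + shift] : hi[i + shift - 16];
;
; The block loaded lowest in one iteration is carried over in m4/m5 and
; reused by the next iteration's palignr, so each aligned block of v2/v3 is
; loaded only once.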
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
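;
; Same operation as the SSE2 version above, but unaligned v2/v3 are handled
; without movu: both pointers are rounded down to a 16-byte boundary and the
; code dispatches to the SCALARPRODUCT_LOOP instance matching the observed
; misalignment. Illustrative C for the dispatch value (spelled out from the
; code below, not part of the original source):
;
;     int shift = (uintptr_t)v2 & 15;   /* even: 0, 2, ..., 14 */
;
; v2 and v3 are assumed to share the same misalignment, since the same shift
; is applied to both.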
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7          ; broadcast mul to all 8 words of m7
    pxor    m6, m6
    mov    r4d, v2d
    and    r4d, 15             ; r4d = byte misalignment of v2 (v3 must match)
    and    v2q, ~15            ; round both pointers down to 16-byte boundaries
    and    v3q, ~15
    mova    m4, [v2q + orderq] ; preload the topmost aligned blocks (used by the first palignr)
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp    r4d, 0
    je .loop0
    cmp    r4d, 2
    je .loop2
    cmp    r4d, 4
    je .loop4
    cmp    r4d, 6
    je .loop6
    cmp    r4d, 8
    je .loop8
    cmp    r4d, 10
    je .loop10
    cmp    r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD   m6, m0
    movd   eax, m6
    RET