;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM sse2
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
    HADDD   m6, m0
    movd    eax, m6
    RET

INIT_XMM sse4
; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
;                                     int order, int mul)
cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    SPLATW  m7, m7
    pxor    m6, m6
    add     v1q, orderq
    lea     v2q, [v2q + 2*orderq]
    add     v3q, orderq
    neg     orderq
.loop:
    mova    m3, [v1q + orderq]
    movu    m0, [v2q + 2*orderq]
    pmovsxwd m4, m3
    movu    m1, [v2q + 2*orderq + mmsize]
    movhlps m5, m3
    movu    m2, [v3q + orderq]
    pmovsxwd m5, m5
    pmullw  m2, m7
    pmulld  m0, m4
    pmulld  m1, m5
    paddw   m2, m3
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    add     orderq, 16
    jl .loop
    HADDD   m6, m0
    movd    eax, m6
    RET

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0 [v1q + orderq]
    %define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0 m8
    %define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro
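
; The SSSE3 entry point below rounds v2 and v3 down to 16-byte alignment and
; dispatches on the residual byte offset of v2; SCALARPRODUCT_LOOP above then
; rebuilds each unaligned 16-byte chunk from two aligned loads with palignr,
; keeping the previous aligned block live in m4/m5 across iterations.  As a
; rough illustration (a C sketch of what one palignr produces here, not part
; of the build; the helper name is made up):
;
;   /* bytes [off, off+16) of the stream, given the two aligned 16-byte
;    * blocks lo and hi that straddle it -- i.e. "palignr hi, lo, off" */
;   static inline void blend_unaligned(unsigned char dst[16],
;                                      const unsigned char lo[16],
;                                      const unsigned char hi[16], int off)
;   {
;       for (int i = 0; i < 16; i++)
;           dst[i] = off + i < 16 ? lo[off + i] : hi[off + i - 16];
;   }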

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je .loop0
    cmp     r4d, 2
    je .loop2
    cmp     r4d, 4
    je .loop4
    cmp     r4d, 6
    je .loop6
    cmp     r4d, 8
    je .loop8
    cmp     r4d, 10
    je .loop10
    cmp     r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD   m6, m0
    movd    eax, m6
    RET
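
; For reference, the int16 variants above compute, in scalar terms (a C
; sketch modelled on FFmpeg's C fallback; the exact prototype there may
; differ):
;
;   #include <stdint.h>
;
;   static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1,
;                                                   const int16_t *v2,
;                                                   const int16_t *v3,
;                                                   int order, int mul)
;   {
;       int32_t res = 0;
;       do {
;           res   += *v1 * *v2++;   /* dot product, accumulated in 32 bits */
;           *v1++ += mul * *v3++;   /* in-place madd, wrapping in 16 bits  */
;       } while (--order);
;       return res;
;   }
;
; The int32 variant does the same, except that v2 holds int32_t values and
; each product keeps only its low 32 bits (pmulld).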