;******************************************************************************
;* SIMD-optimized functions for the DCA decoder
;* Copyright (C) 2016 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%define sizeof_float 4
%define FMA3_OFFSET (8 * cpuflag(fma3))

%macro LFE_FIR0_FLOAT 0
cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
    shr  nblocksd, 1
    sub  lfeq, 7*sizeof_float
    mov  cnt1d, 32*sizeof_float
    mov  cnt2d, 32*sizeof_float-8-FMA3_OFFSET
    lea  coeffq, [coeffq+cnt1q*8]
    add  samplesq, cnt1q
    neg  cnt1q

.loop:
%if cpuflag(avx)
    cvtdq2ps  m4, [lfeq+16]
    cvtdq2ps  m5, [lfeq   ]
    shufps    m7, m4, m4, q0123
    shufps    m6, m5, m5, q0123
%else
    movu      m4, [lfeq+16]
    movu      m5, [lfeq   ]
    cvtdq2ps  m4, m4
    cvtdq2ps  m5, m5
    pshufd    m7, m4, q0123
    pshufd    m6, m5, q0123
%endif

.inner_loop:
%if ARCH_X86_64
    movaps    m8, [coeffq+cnt1q*8   ]
    movaps    m9, [coeffq+cnt1q*8+16]
    movaps   m10, [coeffq+cnt1q*8+32]
    movaps   m11, [coeffq+cnt1q*8+48]
%if cpuflag(fma3)
    movaps   m12, [coeffq+cnt1q*8+64]
    movaps   m13, [coeffq+cnt1q*8+80]
    movaps   m14, [coeffq+cnt1q*8+96]
    movaps   m15, [coeffq+cnt1q*8+112]
    mulps     m0, m7, m8
    mulps     m1, m7, m10
    mulps     m2, m7, m12
    mulps     m3, m7, m14
    fmaddps   m0, m6, m9, m0
    fmaddps   m1, m6, m11, m1
    fmaddps   m2, m6, m13, m2
    fmaddps   m3, m6, m15, m3

    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0
%else
    mulps     m0, m7, m8
    mulps     m1, m6, m9
    mulps     m2, m7, m10
    mulps     m3, m6, m11
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m0, m2
    unpcklps  m0, m2
    addps     m3, m0
    movhlps   m2, m3
    addps     m2, m3
    movlps [samplesq+cnt1q], m2
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
    mulps     m0, m7, [coeffq+cnt1q*8    ]
    mulps     m1, m7, [coeffq+cnt1q*8+32 ]
    mulps     m2, m7, [coeffq+cnt1q*8+64 ]
    mulps     m3, m7, [coeffq+cnt1q*8+96 ]
    fmaddps   m0, m6, [coeffq+cnt1q*8+16 ], m0
    fmaddps   m1, m6, [coeffq+cnt1q*8+48 ], m1
    fmaddps   m2, m6, [coeffq+cnt1q*8+80 ], m2
    fmaddps   m3, m6, [coeffq+cnt1q*8+112], m3

    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0
%else
    mulps     m0, m7, [coeffq+cnt1q*8   ]
    mulps     m1, m6, [coeffq+cnt1q*8+16]
    mulps     m2, m7, [coeffq+cnt1q*8+32]
    mulps     m3, m6, [coeffq+cnt1q*8+48]
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m0, m2
    unpcklps  m0, m2
    addps     m3, m0
    movhlps   m2, m3
    addps     m2, m3
    movlps [samplesq+cnt1q], m2
%endif
%endif ; ARCH

%if ARCH_X86_64
%if cpuflag(fma3)
    mulps     m8, m5
    mulps    m10, m5
    mulps    m12, m5
    mulps    m14, m5
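    ; accumulate the second tap group: the odd coefficient vectors
    ; (m9/m11/m13/m15) pair with the newer LFE samples in m4 to finish
    ; each 8-tap dot product for the mirrored output block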
    fmaddps   m8, m4, m9, m8
    fmaddps  m10, m4, m11, m10
    fmaddps  m12, m4, m13, m12
    fmaddps  m14, m4, m15, m14

    haddps   m10, m8
    haddps   m14, m12
    haddps   m14, m10
    movaps [samplesq+cnt2q], m14
%else
    mulps     m8, m5
    mulps     m9, m4
    mulps    m10, m5
    mulps    m11, m4
    addps     m8, m9
    addps    m10, m11

    unpckhps m11, m10, m8
    unpcklps m10, m8
    addps    m11, m10
    movhlps   m8, m11
    addps     m8, m11
    movlps [samplesq+cnt2q], m8
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
    mulps     m0, m5, [coeffq+cnt1q*8    ]
    mulps     m1, m5, [coeffq+cnt1q*8+32 ]
    mulps     m2, m5, [coeffq+cnt1q*8+64 ]
    mulps     m3, m5, [coeffq+cnt1q*8+96 ]
    fmaddps   m0, m4, [coeffq+cnt1q*8+16 ], m0
    fmaddps   m1, m4, [coeffq+cnt1q*8+48 ], m1
    fmaddps   m2, m4, [coeffq+cnt1q*8+80 ], m2
    fmaddps   m3, m4, [coeffq+cnt1q*8+112], m3

    haddps    m1, m0
    haddps    m3, m2
    haddps    m3, m1
    movaps [samplesq+cnt2q], m3
%else
    mulps     m0, m5, [coeffq+cnt1q*8   ]
    mulps     m1, m4, [coeffq+cnt1q*8+16]
    mulps     m2, m5, [coeffq+cnt1q*8+32]
    mulps     m3, m4, [coeffq+cnt1q*8+48]
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m2, m0
    unpcklps  m2, m0
    addps     m3, m2
    movhlps   m0, m3
    addps     m0, m3
    movlps [samplesq+cnt2q], m0
%endif
%endif ; ARCH

    sub      cnt2d, 8 + FMA3_OFFSET
    add      cnt1q, 8 + FMA3_OFFSET
    jl .inner_loop

    add      lfeq, sizeof_float
    add      samplesq, 64*sizeof_float
    mov      cnt1q, -32*sizeof_float
    mov      cnt2d, 32*sizeof_float-8-FMA3_OFFSET
    sub      nblocksd, 1
    jg .loop
    RET
%endmacro

INIT_XMM sse2
LFE_FIR0_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR0_FLOAT
%endif
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
LFE_FIR0_FLOAT
%endif

%macro LFE_FIR1_FLOAT 0
cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
    shr  nblocksd, 2
    sub  lfeq, 3*sizeof_float
    mov  cnt1d, 64*sizeof_float
    mov  cnt2d, 64*sizeof_float-16
    lea  coeffq, [coeffq+cnt1q*4]
    add  samplesq, cnt1q
    neg  cnt1q

.loop:
%if cpuflag(avx)
    cvtdq2ps  m4, [lfeq]
    shufps    m5, m4, m4, q0123
%else
    movu      m4, [lfeq]
    cvtdq2ps  m4, m4
    pshufd    m5, m4, q0123
%endif

.inner_loop:
    movaps    m6, [coeffq+cnt1q*4   ]
    movaps    m7, [coeffq+cnt1q*4+16]
    mulps     m0, m5, m6
    mulps     m1, m5, m7
%if ARCH_X86_64
    movaps    m8, [coeffq+cnt1q*4+32]
    movaps    m9, [coeffq+cnt1q*4+48]
    mulps     m2, m5, m8
    mulps     m3, m5, m9
%else
    mulps     m2, m5, [coeffq+cnt1q*4+32]
    mulps     m3, m5, [coeffq+cnt1q*4+48]
%endif

    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0

    mulps     m6, m4
    mulps     m7, m4
%if ARCH_X86_64
    mulps     m8, m4
    mulps     m9, m4

    ; the second output block is stored mirrored, so the horizontal adds
    ; must run in reverse operand order (as in lfe_fir0 above)
    haddps    m7, m6
    haddps    m9, m8
    haddps    m9, m7
    movaps [samplesq+cnt2q], m9
%else
    mulps     m2, m4, [coeffq+cnt1q*4+32]
    mulps     m3, m4, [coeffq+cnt1q*4+48]

    haddps    m7, m6
    haddps    m3, m2
    haddps    m3, m7
    movaps [samplesq+cnt2q], m3
%endif

    sub      cnt2d, 16
    add      cnt1q, 16
    jl .inner_loop

    add      lfeq, sizeof_float
    add      samplesq, 128*sizeof_float
    mov      cnt1q, -64*sizeof_float
    mov      cnt2d, 64*sizeof_float-16
    sub      nblocksd, 1
    jg .loop
    RET
%endmacro

INIT_XMM sse3
LFE_FIR1_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR1_FLOAT
%endif
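; Note: lfe_fir0 interpolates each decimated LFE sample into 64 PCM samples
; using 8 taps per output, lfe_fir1 into 128 PCM samples using 4 taps per
; output. Both walk the coefficient table once per LFE sample, writing the
; first half of each output block forwards (cnt1) and, because the FIR is
; symmetric, the second half backwards (cnt2) from the same coefficients.
;
; A minimal sketch of the C-side prototype these symbols are assumed to
; match, inferred from the 4-argument calling convention above (the exact
; declaration lives in the decoder's dcadsp headers and may differ):
;
;   void ff_lfe_fir0_float_sse2(float *pcm_samples, int32_t *lfe_samples,
;                               const float *filter_coeff,
;                               ptrdiff_t npcmblocks);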