;******************************************************************************
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Sign-bit mask: lanes 0-1 are left untouched, lanes 2-3 get their sign bit
; flipped when this is XORed into a vector of floats (see BUTTERFLY0).
; The pattern is repeated so that it also fills a 256-bit register.
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000

; Multiplier table, addressed by byte offset from ps_cos_vec.
; Each row below is 4 floats = 16 bytes.
ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043  ; +0    pass 1
            dd   0.553104,  0.582935,  0.622504,  0.674808  ; +16   pass 1
            dd -10.190008, -3.407609, -2.057781, -1.484165  ; +32   pass 1
            dd  -1.169440, -0.972568, -0.839350, -0.744536  ; +48   pass 1
            dd   0.502419,  0.522499,  0.566944,  0.646822  ; +64   pass 2
            dd   0.788155,  1.060678,  1.722447,  5.101149  ; +80   pass 2
            dd   0.509796,  0.601345,  0.899976,  2.562916  ; +96   pass 3
            dd   0.509796,  0.601345,  0.899976,  2.562916  ; +112  pass 3 (upper half of the ymm load)
            dd   1.000000,  1.000000,  1.306563,  0.541196  ; +128  pass 4
            dd   1.000000,  1.000000,  1.306563,  0.541196  ; +144  pass 4 (upper half of the ymm load)
            dd   1.000000,  0.707107,  1.000000, -0.707107  ; +160  pass 5
            dd   1.000000,  0.707107,  1.000000, -0.707107  ; +176  pass 5 (upper half of the ymm load)
            dd   0.707107,  0.707107,  0.707107,  0.707107  ; +192  BUTTERFLY3V

;------------------------------------------------------------------------------
; BUTTERFLY a, b, coef, tmp
;   tmp = a - b
;   b   = b + a
;   a   = tmp * coef
; Clobbers tmp. Written in three-operand form; the non-AVX build presumably
; relies on the instruction wrappers pulled in through x86util.asm to lower
; it to two-operand SSE -- confirm against the include.
;------------------------------------------------------------------------------
%macro BUTTERFLY 4
    subps   %4, %1, %2
    addps   %2, %2, %1
    mulps   %1, %4, %3
%endmacro

;------------------------------------------------------------------------------
; BUTTERFLY0 a, signmask, coef, tmp, shuf_imm
;   a = (shuffle(a, shuf_imm) + (a XOR signmask)) * coef
; Both branches compute the same value; the SSE2 (non-AVX) branch uses pshufd
; so the shuffled copy lands in tmp in a single instruction.
; Clobbers tmp.
;------------------------------------------------------------------------------
%macro BUTTERFLY0 5
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd  %4, %1, %5
    xorps   %1, %2
    addps   %1, %4
    mulps   %1, %3
%else
    shufps  %4, %1, %1, %5
    xorps   %1, %1, %2
    addps   %4, %4, %1
    mulps   %1, %4, %3
%endif
%endmacro

; BUTTERFLY2: BUTTERFLY0 with shuffle 0x1b (reverses the 4 lanes of each
; 128-bit group).
%macro BUTTERFLY2 4
    BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro

; BUTTERFLY3: BUTTERFLY0 with shuffle 0xb1 (swaps the lanes of each adjacent
; pair: 0<->1, 2<->3).
%macro BUTTERFLY3 4
    BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro

;------------------------------------------------------------------------------
; BUTTERFLY3V r1, r2, r3, r4, rtmp      (arguments are register NUMBERS)
;   new r1 = r1 + r2
;   new r2 = (r1 - r2) * ps_cos_vec[+192]
;   new r3 = r3 + r4
;   new r4 = (r4 - r3) * ps_cos_vec[+192]
; The SWAP renames r2/rtmp instead of copying, so after the macro the register
; names %2 and %5 refer to exchanged physical registers. rtmp is clobbered.
;------------------------------------------------------------------------------
%macro BUTTERFLY3V 5
    movaps m%5, m%1
    addps  m%1, m%2
    subps  m%5, m%2
    SWAP %2, %5
    mulps  m%2, [ps_cos_vec+192]
    movaps m%5, m%3
    addps  m%3, m%4
    subps  m%4, m%5
    mulps  m%4, [ps_cos_vec+192]
%endmacro

;------------------------------------------------------------------------------
; PASS6_AND_PERMUTE
; Scalar final pass that works in place on the 32 floats at [outq].
; On entry the low lanes of m0, m1, m4, m5 and m6 must already hold live
; values (they are accumulated into before being loaded); m2, m3 and m7 are
; loaded from memory before use. tmpd is used to move raw 32-bit values that
; only change position.
; The exact interleaving of loads and stores matters: several slots are read
; after neighbouring slots have already been overwritten. Do not reorder.
; Clobbers m0-m7 and tmpd.
;------------------------------------------------------------------------------
%macro PASS6_AND_PERMUTE 0
    mov         tmpd, [outq+4]
    movss         m7, [outq+72]
    addss         m7, [outq+76]
    movss         m3, [outq+56]
    addss         m3, [outq+60]
    addss         m4, m3
    movss         m2, [outq+52]
    addss         m2, m3
    movss         m3, [outq+104]
    addss         m3, [outq+108]
    addss         m1, m3
    addss         m5, m4
    movss [outq+ 16], m1
    movss         m1, [outq+100]
    addss         m1, m3
    movss         m3, [outq+40]
    movss [outq+ 48], m1
    addss         m3, [outq+44]
    movss         m1, [outq+100]
    addss         m4, m3
    addss         m3, m2
    addss         m1, [outq+108]
    movss [outq+ 40], m3
    addss         m2, [outq+36]
    movss         m3, [outq+8]
    movss [outq+ 56], m2
    addss         m3, [outq+12]
    movss [outq+ 32], m3
    movss         m3, [outq+80]
    movss [outq+  8], m5
    movss [outq+ 80], m1
    movss         m2, [outq+52]
    movss         m5, [outq+120]
    addss         m5, [outq+124]
    movss         m1, [outq+64]
    addss         m2, [outq+60]
    addss         m0, m5
    addss         m5, [outq+116]
    mov    [outq+64], tmpd
    addss         m6, m0
    addss         m1, m6
    mov         tmpd, [outq+12]
    mov   [outq+ 96], tmpd
    movss [outq+  4], m1
    movss         m1, [outq+24]
    movss [outq+ 24], m4
    movss         m4, [outq+88]
    addss         m4, [outq+92]
    addss         m3, m4
    addss         m4, [outq+84]
    mov         tmpd, [outq+108]
    addss         m1, [outq+28]
    addss         m0, m1
    addss         m1, m5
    addss         m6, m3
    addss         m3, m0
    addss         m0, m7
    addss         m5, [outq+20]
    addss         m7, m1
    movss [outq+ 12], m6
    mov   [outq+112], tmpd
    movss         m6, [outq+28]
    movss [outq+ 28], m0
    movss         m0, [outq+36]
    movss [outq+ 36], m7
    addss         m1, m4
    movss         m7, [outq+116]
    addss         m0, m2
    addss         m7, [outq+124]
    movss [outq+ 72], m0
    movss         m0, [outq+44]
    addss         m2, m0
    movss [outq+ 44], m1
    movss [outq+ 88], m2
    addss         m0, [outq+60]
    mov         tmpd, [outq+60]
    mov   [outq+120], tmpd
    movss [outq+104], m0
    addss         m4, m5
    addss         m5, [outq+68]
    movss  [outq+52], m4
    movss  [outq+60], m5
    movss         m4, [outq+68]
    movss         m5, [outq+20]
    movss [outq+ 20], m3
    addss         m5, m7
    addss         m7, m6
    addss         m4, m5
    movss         m2, [outq+84]
    addss         m2, [outq+92]
    addss         m5, m2
    movss [outq+ 68], m4
    addss         m2, m7
    movss         m4, [outq+76]
    movss [outq+ 84], m2
    movss [outq+ 76], m5
    addss         m7, m4
    addss         m6, [outq+124]
    addss         m4, m6
    addss         m6, [outq+92]
    movss [outq+100], m4
    movss [outq+108], m6
    movss         m6, [outq+92]
    movss  [outq+92], m7
    addss         m6, [outq+124]
    movss [outq+116], m6
%endmacro

INIT_YMM avx
SECTION .text
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
;
; AVX version: passes 1-5 run on 256-bit registers, pass 6 is scalar.
; In:    outq = out, inq = in (32 floats each); tmp is a scratch GPR.
; Both buffers are accessed with vmovaps on ymm registers, so they must be
; 32-byte aligned.
; The 8 declared vector registers m0-m7 are all used.
;------------------------------------------------------------------------------
cglobal dct32_float, 2,3,8, out, in, tmp
    ; pass 1
    vmovaps     m4, [inq+0]
    ; Build the mirrored operand: the two 128-bit halves are loaded swapped
    ; and each half is then lane-reversed by the 0x1b shuffle.
    vinsertf128 m5, m5, [inq+96], 1
    vinsertf128 m5, m5, [inq+112], 0
    vshufps     m5, m5, m5, 0x1b
    BUTTERFLY   m4, m5, [ps_cos_vec], m6

    vmovaps     m2, [inq+64]
    vinsertf128 m6, m6, [inq+32], 1
    vinsertf128 m6, m6, [inq+48], 0
    vshufps     m6, m6, m6, 0x1b
    BUTTERFLY   m2, m6, [ps_cos_vec+32], m0

    ; pass 2

    BUTTERFLY   m5, m6, [ps_cos_vec+64], m0
    BUTTERFLY   m4, m2, [ps_cos_vec+64], m7


    ; pass 3
    vperm2f128  m3, m6, m4, 0x31        ; m3 = high halves of m6 and m4
    vperm2f128  m1, m6, m4, 0x20        ; m1 = low halves of m6 and m4
    vshufps     m3, m3, m3, 0x1b

    BUTTERFLY   m1, m3, [ps_cos_vec+96], m6


    vperm2f128  m4, m5, m2, 0x20
    vperm2f128  m5, m5, m2, 0x31
    vshufps     m5, m5, m5, 0x1b

    BUTTERFLY   m4, m5, [ps_cos_vec+96], m6

    ; pass 4
    vmovaps     m6, [ps_p1p1m1m1+0]
    vmovaps     m2, [ps_cos_vec+128]

    BUTTERFLY2  m5, m6, m2, m7
    BUTTERFLY2  m4, m6, m2, m7
    BUTTERFLY2  m1, m6, m2, m7
    BUTTERFLY2  m3, m6, m2, m7


    ; pass 5
    vshufps     m6, m6, m6, 0xcc        ; sign mask now hits lanes 1 and 3
    vmovaps     m2, [ps_cos_vec+160]

    BUTTERFLY3  m5, m6, m2, m7
    BUTTERFLY3  m4, m6, m2, m7
    BUTTERFLY3  m1, m6, m2, m7
    BUTTERFLY3  m3, m6, m2, m7

    ; Spill the pass-5 results to out[] and leave the values that
    ; PASS6_AND_PERMUTE expects live in the low 128 bits of m0/m1/m4/m5/m6.
    vperm2f128  m6, m3, m3, 0x31
    vmovaps     [outq], m3

    vextractf128  [outq+64], m5, 1
    vextractf128  [outq+32], m5, 0

    vextractf128  [outq+80], m4, 1
    vextractf128  [outq+48], m4, 0

    vperm2f128  m0, m1, m1, 0x31
    vmovaps     [outq+96], m1

    ; The scalar pass below only touches the xmm parts; clear the upper
    ; halves before switching to 128-bit code.
    vzeroupper

    ;    pass 6, no SIMD...
INIT_XMM
    PASS6_AND_PERMUTE
    RET
%endif

%if ARCH_X86_64
; With 16 vector registers nothing has to go to memory: "spilling" is just a
; register rename.
%define SPILL SWAP
%define UNSPILL SWAP

;------------------------------------------------------------------------------
; PASS5 (x86-64): renames the pass-4 results into m8-m15, transposes them in
; two 4x4 groups and finishes with vertical butterflies and adds.
; m0 is used as scratch by TRANSPOSE4x4PS / BUTTERFLY3V.
;------------------------------------------------------------------------------
%macro PASS5 0
    nop ; FIXME code alignment
    SWAP 5, 8
    SWAP 4, 12
    SWAP 6, 14
    SWAP 7, 13
    SWAP 0, 15
    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
    TRANSPOSE4x4PS 8, 9, 10, 11, 0
    BUTTERFLY3V    8, 9, 10, 11, 0
    addps   m10, m11
    TRANSPOSE4x4PS 12, 13, 14, 15, 0
    BUTTERFLY3V    12, 13, 14, 15, 0
    addps   m14, m15
    addps   m12, m14
    addps   m14, m13
    addps   m13, m15
%endmacro

;------------------------------------------------------------------------------
; PASS6 (x86-64): final scalar adds and stores to out[], working from the
; values held in m8-m15.
;   * lane 0 of each of m8-m14 is stored directly at a 16-byte stride; m15 is
;     stored whole with movaps, which also writes out+0x7c;
;   * lane 1 (brought down by pshuflw 0xe) yields the out+8, +0x18, ... column;
;   * lanes 2 and 3 (movhlps / pshufd 3) are peeled off in the %rep loops and
;     produce the odd outputs at out+4, +12, ..., +116.
; The SWAP/PERMUTE lines only rename registers; they emit no instructions.
;------------------------------------------------------------------------------
%macro PASS6 0
    SWAP 9, 12
    SWAP 11, 14
    movss  [outq+0x00], m8
    pshuflw m0, m8, 0xe
    movss  [outq+0x10], m9
    pshuflw m1, m9, 0xe
    movss  [outq+0x20], m10
    pshuflw m2, m10, 0xe
    movss  [outq+0x30], m11
    pshuflw m3, m11, 0xe
    movss  [outq+0x40], m12
    pshuflw m4, m12, 0xe
    movss  [outq+0x50], m13
    pshuflw m5, m13, 0xe
    movss  [outq+0x60], m14
    pshuflw m6, m14, 0xe
    movaps [outq+0x70], m15
    pshuflw m7, m15, 0xe
    addss   m0, m1
    addss   m1, m2
    movss  [outq+0x08], m0
    addss   m2, m3
    movss  [outq+0x18], m1
    addss   m3, m4
    movss  [outq+0x28], m2
    addss   m4, m5
    movss  [outq+0x38], m3
    addss   m5, m6
    movss  [outq+0x48], m4
    addss   m6, m7
    movss  [outq+0x58], m5
    movss  [outq+0x68], m6
    movss  [outq+0x78], m7

    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    movhlps m0, m1
    pshufd  m1, m1, 3
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%rep 7
    movhlps m0, m1
    pshufd  m1, m1, 3
    addss   m15, m1
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
%assign i 4
%rep 15
    addss   m0, m1
    movss  [outq+i], m0
    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    %assign i i+8
%endrep
%endmacro

%else ; ARCH_X86_32
; Only 8 vector registers: "registers" 8-15 live in the output buffer, slot n
; at byte offset (n-8)*16 from outq.
%macro SPILL 2 ; xmm#, mempos
    movaps [outq+(%2-8)*16], m%1
%endmacro
%macro UNSPILL 2 ; xmm#, mempos
    movaps m%1, [outq+(%2-8)*16]
%endmacro

%define PASS6 PASS6_AND_PERMUTE

;------------------------------------------------------------------------------
; PASS5 (x86-32): eight BUTTERFLY3 steps, one per 4-float group, each result
; spilled to its slot in out[]. m3 must hold the ps_p1p1m1m1 mask on entry
; (it is re-shuffled here so that it hits lanes 1 and 3); m2 is reloaded with
; the pass-5 multipliers.
;------------------------------------------------------------------------------
%macro PASS5 0
    movaps m2, [ps_cos_vec+160]
    shufps m3, m3, 0xcc

    BUTTERFLY3  m5, m3, m2, m1
    SPILL 5, 8

    UNSPILL 1, 9
    BUTTERFLY3  m1, m3, m2, m5
    SPILL 1, 14

    BUTTERFLY3  m4, m3, m2, m5
    SPILL 4, 12

    BUTTERFLY3  m7, m3, m2, m5
    SPILL 7, 13

    UNSPILL 5, 10
    BUTTERFLY3  m5, m3, m2, m7
    SPILL 5, 10

    UNSPILL 4, 11
    BUTTERFLY3  m4, m3, m2, m7
    SPILL 4, 11

    BUTTERFLY3  m6, m3, m2, m7
    SPILL 6, 9

    BUTTERFLY3  m0, m3, m2, m7
    SPILL 0, 15
%endmacro
%endif


;------------------------------------------------------------------------------
; void ff_dct32_float(FFTSample *out, const FFTSample *in)
;
; 128-bit version, instantiated below for SSE2.
; In:    outq = out, inq = in (32 floats each); tmp is a scratch GPR.
; movaps is used on both buffers, so they must be at least 16-byte aligned.
; Passes 1 and 2 are interleaved to keep register pressure down; SPILL /
; UNSPILL move intermediate vectors to slots 8-15 (registers on x86-64,
; the out buffer on x86-32).
;------------------------------------------------------------------------------
%macro DCT32_FUNC 0
cglobal dct32_float, 2, 3, 16, out, in, tmp
    ; pass 1

    movaps      m0, [inq+0]
    LOAD_INV    m1, [inq+112]
    BUTTERFLY   m0, m1, [ps_cos_vec], m3

    movaps      m7, [inq+64]
    LOAD_INV    m4, [inq+48]
    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3

    ; pass 2
    movaps      m2, [ps_cos_vec+64]
    BUTTERFLY   m1, m4, m2, m3
    SPILL 1, 11
    SPILL 4, 8

    ; pass 1
    movaps      m1, [inq+16]
    LOAD_INV    m6, [inq+96]
    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3

    movaps      m4, [inq+80]
    LOAD_INV    m5, [inq+32]
    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3

    ; pass 2
    BUTTERFLY   m0, m7, m2, m3

    movaps      m2, [ps_cos_vec+80]
    BUTTERFLY   m6, m5, m2, m3

    BUTTERFLY   m1, m4, m2, m3

    ; pass 3
    movaps      m2, [ps_cos_vec+96]
    shufps      m1, m1, 0x1b
    BUTTERFLY   m0, m1, m2, m3
    SPILL 0, 15
    SPILL 1, 14

    UNSPILL 0, 8
    shufps      m5, m5, 0x1b
    BUTTERFLY   m0, m5, m2, m3

    UNSPILL 1, 11
    shufps      m6, m6, 0x1b
    BUTTERFLY   m1, m6, m2, m3
    SPILL 1, 11

    shufps      m4, m4, 0x1b
    BUTTERFLY   m7, m4, m2, m3

    ; pass 4
    movaps      m3, [ps_p1p1m1m1+0]
    movaps      m2, [ps_cos_vec+128]

    BUTTERFLY2  m5, m3, m2, m1

    BUTTERFLY2  m0, m3, m2, m1
    SPILL 0, 9

    BUTTERFLY2  m6, m3, m2, m1
    SPILL 6, 10

    UNSPILL 0, 11
    BUTTERFLY2  m0, m3, m2, m1
    SPILL 0, 11

    BUTTERFLY2  m4, m3, m2, m1

    BUTTERFLY2  m7, m3, m2, m1

    UNSPILL 6, 14
    BUTTERFLY2  m6, m3, m2, m1

    UNSPILL 0, 15
    BUTTERFLY2  m0, m3, m2, m1

    PASS5
    PASS6
    RET
%endmacro

; LOAD_INV dst, src: load 4 floats from src with their lane order reversed.
%macro LOAD_INV 2
    pshufd      %1, %2, 0x1b
%endmacro

INIT_XMM sse2
DCT32_FUNC