/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define M_SQRT1_2 0.70710678118654752440

/*
 * Register conventions shared by the internal fftN_neon / fft_pass_neon
 * routines in this file. They are set up once by ff_fft_calc_neon and are
 * only read afterwards:
 *   x7  = -8            post-increment used to walk the wim pointer backwards
 *   x14 = ff_cos_16     twiddle table read by fft16_neon
 *   v28 = mppm          {-w, +w, +w, -w} with w = M_SQRT1_2
 *   v29 = pmmp          {+1, -1, -1, +1}
 *   v30 = trans4_float  tbl index vector
 *   v31 = trans8_float  tbl index vector
 */

// transpose d0, d1, s0, s1
// d0 = even-numbered elements of s0/s1 interleaved (trn1),
// d1 = odd-numbered elements of s0/s1 interleaved (trn2).
// d0/d1 are written in that order, so d0 must not alias s0 or s1.
.macro transpose d0, d1, s0, s1
        trn1            \d0, \s0, \s1
        trn2            \d1, \s0, \s1
.endm


/*
 * fft4_neon: in-place transform of the four {re, im} float pairs at [x0].
 * In:    x0 = data pointer (32 bytes are read and written back)
 * Out:   x0 unchanged
 * Clobb: v0-v7, v16, v17
 */
function fft4_neon
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        fadd            v4.2s,  v0.2s,  v1.2s           // r0+r1,i0+i1
        fsub            v6.2s,  v0.2s,  v1.2s           // r0-r1,i0-i1

        ext             v16.8b, v2.8b,  v3.8b,  #4      // i2,r3
        ext             v17.8b, v3.8b,  v2.8b,  #4      // i3,r2

        fadd            v5.2s,  v2.2s,  v3.2s           // r2+r3,i2+i3
        fsub            v7.2s,  v16.2s, v17.2s          // i2-i3,r3-r2

        fadd            v0.2s,  v4.2s,  v5.2s
        fsub            v2.2s,  v4.2s,  v5.2s
        fadd            v1.2s,  v6.2s,  v7.2s
        fsub            v3.2s,  v6.2s,  v7.2s

        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc

/*
 * fft8_neon: in-place transform of the eight {re, im} float pairs at [x0].
 * In:    x0  = data pointer (64 bytes are read and written back)
 *        v28 = mppm constant (see register conventions above)
 * Out:   x0 = data + 32, x1 = data
 * Clobb: x0, x1, v0-v7, v16-v27
 */
function fft8_neon
        AARCH64_VALID_JUMP_TARGET
        mov             x1,  x0                         // keep base for the final store
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
        ext             v22.8b, v2.8b,  v3.8b,  #4
        ext             v23.8b, v3.8b,  v2.8b,  #4
        fadd            v4.2s,  v16.2s, v17.2s          // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s          // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s          // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s          // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                  // swap low lanes of mppm: {+w,-w}
        fadd            v20.2s, v0.2s,  v1.2s           // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s           // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s          // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s          // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s          // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s           // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]        // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]        // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s          // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s          // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s

        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]     // upper 32 bytes
        st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]     // lower 32 bytes

        ret
endfunc

/*
 * fft16_neon: in-place transform of the sixteen {re, im} float pairs at [x0].
 * The first 64 bytes go through the same sequence as fft8_neon, the last
 * 64 bytes through two interleaved 4-point butterflies, and the results are
 * combined with an inlined pass using twiddles loaded from [x14].
 * In:    x0  = data pointer (128 bytes are read and written back)
 *        x14 = ff_cos_16 table (four floats are read)
 *        v28, v29, v30, v31 as per the register conventions above
 * Out:   x0 = x1 = data + 128
 * Clobb: x0, x1, v0-v7, v16-v27
 */
function fft16_neon
        AARCH64_VALID_JUMP_TARGET
        mov             x1,  x0                         // keep base for the final stores
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
        ext             v22.8b, v2.8b,  v3.8b,  #4
        ext             v23.8b, v3.8b,  v2.8b,  #4
        fadd            v4.2s,  v16.2s, v17.2s          // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s          // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s          // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s          // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                  // swap low lanes of mppm: {+w,-w}
        fadd            v20.2s, v0.2s,  v1.2s           // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s           // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s          // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s          // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s          // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s           // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]        // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]        // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s          // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s          // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ld1             {v20.4s,v21.4s}, [x0], #32      // data bytes 64..95
        ld1             {v22.4s,v23.4s}, [x0], #32      // data bytes 96..127
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        transpose       v24.2d, v25.2d, v20.2d, v22.2d
        transpose       v26.2d, v27.2d, v21.2d, v23.2d
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s
        ext             v20.16b, v21.16b, v21.16b, #4   // rotate v21 by one float
        ext             v21.16b, v23.16b, v23.16b, #4   // rotate v23 by one float

        zip1            v0.2d,  v0.2d,  v1.2d           // {z[0],   z[1]}
        zip1            v1.2d,  v2.2d,  v3.2d           // {z[2],   z[3]}
        zip1            v2.2d,  v16.2d, v17.2d          // {z[o1],  z[o1+1]}
        zip1            v3.2d,  v18.2d, v19.2d          // {z[o1+2],z[o1+3]}

        // 2 x fft4
        transpose       v22.2d, v23.2d, v20.2d, v21.2d

        fadd            v4.4s,  v24.4s, v25.4s
        fadd            v5.4s,  v26.4s, v27.4s
        fsub            v6.4s,  v24.4s, v25.4s
        fsub            v7.4s,  v22.4s, v23.4s

        ld1             {v23.4s}, [x14]                 // first four floats of ff_cos_16

        fadd            v24.4s, v4.4s,  v5.4s           // {z[o2+0],z[o2+1]}
        fsub            v26.4s, v4.4s,  v5.4s           // {z[o2+2],z[o2+3]}
        fadd            v25.4s, v6.4s,  v7.4s           // {z[o3+0],z[o3+1]}
        fsub            v27.4s, v6.4s,  v7.4s           // {z[o3+2],z[o3+3]}

        //fft_pass_neon_16
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v23.s[1]
        fmul            v7.4s,  v7.4s,  v29.4s          // apply pmmp signs
        fmla            v25.4s, v7.4s,  v23.s[3]        // {t1a,t2a,t5a,t6a}

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s          // just the second half
        fadd            v5.4s,  v21.4s, v22.4s          // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b        // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b        // trans8_float

        fsub            v20.4s, v0.4s,  v4.4s           // {z[o2],z[o2+1]}
        fadd            v16.4s, v0.4s,  v4.4s           // {z[0], z[1]}
        fsub            v22.4s, v2.4s,  v5.4s           // {z[o3],z[o3+1]}
        fadd            v18.4s, v2.4s,  v5.4s           // {z[o1],z[o1+1]}

//second half
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v23.s[2]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v23.s[3]
        fmul            v6.4s,  v6.4s,  v29.4s          // apply pmmp signs
        fmul            v7.4s,  v7.4s,  v29.4s
        fmla            v26.4s, v6.4s,  v23.s[2]        // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v23.s[1]        // {t1a,t2a,t5a,t6a}

        zip1            v24.4s, v26.4s, v27.4s
        zip2            v25.4s, v26.4s, v27.4s
        fneg            v26.4s, v24.4s
        fadd            v4.4s,  v25.4s, v24.4s
        fsub            v6.4s,  v24.4s, v25.4s          // just the second half
        fadd            v5.4s,  v25.4s, v26.4s          // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b        // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b        // trans8_float

        fadd            v17.4s, v1.4s,  v4.4s           // {z[2], z[3]}
        fsub            v21.4s, v1.4s,  v4.4s           // {z[o2+2],z[o2+3]}
        fadd            v19.4s, v3.4s,  v5.4s           // {z[o1+2],z[o1+3]}
        fsub            v23.4s, v3.4s,  v5.4s           // {z[o3+2],z[o3+3]}

        st1             {v16.4s,v17.4s}, [x1], #32
        st1             {v18.4s,v19.4s}, [x1], #32
        st1             {v20.4s,v21.4s}, [x1], #32
        st1             {v22.4s,v23.4s}, [x1], #32

        ret
endfunc


// tbl index vector (loaded into v30): reorders the four 32-bit lanes of one
// source register as {0, 2, 1, 3}.
const  trans4_float, align=4
        .byte    0,  1,  2,  3
        .byte    8,  9, 10, 11
        .byte    4,  5,  6,  7
        .byte   12, 13, 14, 15
endconst

// tbl index vector (loaded into v31) for a two-register table: picks 32-bit
// lanes {second[2], first[0], second[3], first[1]}.
const  trans8_float, align=4
        .byte   24, 25, 26, 27
        .byte    0,  1,  2,  3
        .byte   28, 29, 30, 31
        .byte    4,  5,  6,  7
endconst

/*
 * fft_pass_neon: combining pass over four equally sized blocks of z.
 * Every iteration reads and rewrites 16 bytes at each of z, z+o1, z+o2 and
 * z+o3, where o1/o2/o3 are byte offsets 16*n, 32*n and 48*n.
 * In:    x0  = z
 *        x2  = n, iteration count; must be >= 2 because the first iteration
 *              is peeled and the remaining n-1 run in a do-while loop that
 *              terminates on the counter reaching exactly zero
 *        x4  = twiddle table; wre is read forwards from x4, wim is read
 *              backwards starting just below x4 + 8*n
 *        x7  = -8 (post-increment for the wim pointer)
 *        v29, v30, v31 as per the register conventions above
 * Out:   x0..x3 advanced by 16*n bytes past their respective blocks
 * Clobb: x0-x6, flags, v4-v7, v16, v17, v20-v27
 * Reached by direct branch only (tail call from the def_fft functions).
 */
function fft_pass_neon
        sub             x6,  x2,  #1                    // n - 1, loop counter
        lsl             x5,  x2,  #3                    // 2 * n * sizeof FFTSample
        lsl             x1,  x2,  #4                    // 2 * n * sizeof FFTComplex
        add             x5,  x4,  x5                    // wim
        add             x3,  x1,  x2,  lsl #5           // o1 + 4 * n * sizeof FFTComplex
        add             x2,  x0,  x2,  lsl #5           // &z[o2]
        add             x3,  x0,  x3                    // &z[o3]
        add             x1,  x0,  x1                    // &z[o1]
        ld1             {v20.4s},[x2]                   // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]                   // {z[o3],z[o3+1]}
        ld1             {v4.2s},  [x4], #8              // {wre[0],wre[1]}
        trn2            v25.2d, v20.2d, v22.2d
        sub             x5,  x5,  #4                    // wim--
        trn1            v24.2d, v20.2d, v22.2d
        ld1             {v5.s}[0],  [x5], x7            // d5[0] = wim[-1]
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v4.s[1]
        ld1             {v16.4s}, [x0]                  // {z[0],z[1]}
        fmul            v7.4s,  v7.4s,  v29.4s          // apply pmmp signs
        ld1             {v17.4s}, [x1]                  // {z[o1],z[o1+1]}
        prfm            pldl1keep, [x2, #16]
        prfm            pldl1keep, [x3, #16]
        fmla            v25.4s, v7.4s,  v5.s[0]         // {t1a,t2a,t5a,t6a}
        prfm            pldl1keep, [x0, #16]
        prfm            pldl1keep, [x1, #16]

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s          // just the second half
        fadd            v5.4s,  v21.4s, v22.4s          // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b        // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b        // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16             // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16             // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16             // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16             // {z[o3],z[o3+1]}
1:
        ld1             {v20.4s},[x2]                   // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]                   // {z[o3],z[o3+1]}
        ld1             {v4.2s}, [x4], #8               // {wre[0],wre[1]}
        transpose       v26.2d, v27.2d, v20.2d, v22.2d
        ld1             {v5.2s}, [x5], x7               // {wim[-1],wim[0]}
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v4.s[0]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v4.s[1]
        fmul            v6.4s,  v6.4s,  v29.4s          // apply pmmp signs
        fmul            v7.4s,  v7.4s,  v29.4s
        ld1             {v16.4s},[x0]                   // {z[0],z[1]}
        fmla            v26.4s, v6.4s,  v5.s[1]         // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v5.s[0]         // {t1a,t2a,t5a,t6a}
        ld1             {v17.4s},[x1]                   // {z[o1],z[o1+1]}

        subs            x6,  x6,  #1                    // n--

        zip1            v20.4s, v26.4s, v27.4s
        zip2            v21.4s, v26.4s, v27.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s          // just the second half
        fadd            v5.4s,  v21.4s, v22.4s          // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b        // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b        // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16             // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16             // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16             // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16             // {z[o3],z[o3+1]}
        b.ne            1b                              // flags from the subs above

        ret
endfunc
/*
 * def_fft n, n2, n4: emit fft<n>_neon, built from the next smaller sizes.
 * The generated function runs fft<n2>_neon on z, fft<n4>_neon on the two
 * blocks starting n4*16 and n4*24 bytes into z, then tail-calls
 * fft_pass_neon with x0 = z, x4 = ff_cos_<n> and x2 = n4/2.
 * In:    x0 = z; the vector and x7/x14 constants expected by the callees
 *        must already be loaded (ff_fft_calc_neon does this)
 * Stack: 16 bytes, holding x28 (callee-saved, used to keep z + n4*16 across
 *        the calls) and x30
 */
.macro  def_fft n, n2, n4
function fft\n\()_neon, align=6
        AARCH64_VALID_JUMP_TARGET
        AARCH64_SIGN_LINK_REGISTER
        sub             sp,  sp,  #16
        stp             x28, x30, [sp]
        add             x28, x0,  #\n4*2*8
        bl              fft\n2\()_neon
        mov             x0,  x28
        bl              fft\n4\()_neon
        add             x0,  x28, #\n4*1*8
        bl              fft\n4\()_neon
        sub             x0,  x28, #\n4*2*8
        ldp             x28, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        movrel          x4,  X(ff_cos_\n)
        mov             x2,  #\n4>>1
        b               fft_pass_neon
endfunc
.endm

        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

/*
 * ff_fft_calc_neon: exported entry point.
 * In:    x0 = context pointer; the 32-bit word at offset 0 is nbits
 *        x1 = z, the data to transform in place
 * Loads the constants shared by all internal routines (v28 = mppm,
 * v29 = pmmp, v30 = trans4_float, v31 = trans8_float, x7 = -8,
 * x14 = ff_cos_16), moves z into x0 and tail-calls entry nbits-2 of
 * fft_tab_neon.
 * NOTE(review): nbits is not range-checked here; the table has 15 entries,
 * so callers must guarantee 2 <= nbits <= 16.
 */
function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]
        movrel          x10, trans4_float
        ldr             w2,  [x0]                       // nbits
        movrel          x11, trans8_float
        sub             w2,  w2,  #2                    // table index = nbits - 2
        movrel          x3,  fft_tab_neon
        ld1             {v30.16b}, [x10]                // trans4_float
        mov             x7,  #-8
        movrel          x12, pmmp
        ldr             x3,  [x3, x2, lsl #3]           // fft_tab_neon[nbits - 2]
        movrel          x13, mppm
        movrel          x14, X(ff_cos_16)
        ld1             {v31.16b}, [x11]                // trans8_float
        mov             x0,  x1                         // internal routines take z in x0
        ld1             {v29.4s},  [x12]                // pmmp
        ld1             {v28.4s},  [x13]                // mppm
        br              x3
endfunc

/*
 * ff_fft_permute_neon: exported; reorders z through a scratch buffer.
 * In:    x0 = context pointer: nbits (32-bit) at offset 0, revtab pointer
 *             at offset 8, tmp_buf pointer at offset 16
 *        x1 = z
 * First loop: for every pair of 8-byte elements of z, one 32-bit word of
 * revtab supplies two 16-bit destination indices (low half for the first
 * element, high half for the second) into tmp_buf.
 * Second loop: copies tmp_buf back over z, 32 bytes per iteration.
 * Clobb: x0-x6, flags, v0, v1
 */
function ff_fft_permute_neon, export=1
        mov             x6,  #1
        ldr             w2,  [x0]                       // nbits
        ldr             x3,  [x0, #16]                  // tmp_buf
        ldr             x0,  [x0, #8]                   // revtab
        lsl             x6,  x6, x2                     // n = 1 << nbits
        mov             x2,  x6                         // second copy of n for the copy-back loop
1:
        ld1             {v0.2s,v1.2s}, [x1], #16        // two elements of z
        ldr             w4,  [x0], #4                   // two 16-bit indices
        uxth            w5,  w4                         // index for the first element
        lsr             w4,  w4,  #16                   // index for the second element
        add             x5,  x3,  x5,  lsl #3
        add             x4,  x3,  x4,  lsl #3
        st1             {v0.2s}, [x5]
        st1             {v1.2s}, [x4]
        subs            x6,  x6, #2
        b.gt            1b

        sub             x1,  x1,  x2,  lsl #3           // rewind x1 to the start of z
1:
        ld1             {v0.4s,v1.4s}, [x3], #32
        st1             {v0.4s,v1.4s}, [x1], #32
        subs            x2,  x2,  #4
        b.gt            1b

        ret
endfunc

// Dispatch table for ff_fft_calc_neon, indexed by nbits - 2.
const   fft_tab_neon, relocate=1
        .quad fft4_neon
        .quad fft8_neon
        .quad fft16_neon
        .quad fft32_neon
        .quad fft64_neon
        .quad fft128_neon
        .quad fft256_neon
        .quad fft512_neon
        .quad fft1024_neon
        .quad fft2048_neon
        .quad fft4096_neon
        .quad fft8192_neon
        .quad fft16384_neon
        .quad fft32768_neon
        .quad fft65536_neon
endconst

// Sign mask {+, -, -, +}; loaded into v29 by ff_fft_calc_neon.
const   pmmp, align=4
        .float          +1.0, -1.0, -1.0, +1.0
endconst

// {-w, +w, +w, -w} with w = M_SQRT1_2; loaded into v28 by ff_fft_calc_neon.
const   mppm, align=4
        .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst