/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
@ all single-precision VFP registers may be corrupted on exit. The a2
@ register may not be clobbered in these functions, as it holds the
@ stored original FPSCR.
@ ---------------------------------------------------------------------------
@ ff_fft_calc_vfp: exported entry point, dispatches on transform size.
@ In:    a1 = pointer to a structure whose first word is nbits
@        a2 = second argument, forwarded in a1 to the selected routine
@             (presumably the complex data array z; confirm against the
@             C prototype)
@ The table starts at nbits = 2, hence the bias of -8 (two 4-byte entries).
@ The routine is entered by loading pc with lr untouched, so it returns
@ directly to the caller of ff_fft_calc_vfp.
@ Clobbers: a1, a2, ip, plus whatever the selected routine clobbers.
@ ---------------------------------------------------------------------------
function ff_fft_calc_vfp, export=1
        ldr     ip, [a1, #0]                @ nbits
        mov     a1, a2
        movrel  a2, (fft_tab_vfp - 8)
        ldr     pc, [a2, ip, lsl #2]
endfunc

@ Jump table for ff_fft_calc_vfp; entry k handles nbits = k + 2.
const   fft_tab_vfp, relocate=1
        .word   fft4_vfp
        .word   fft8_vfp
        .word   X(ff_fft16_vfp)             @ this one alone is exported
        .word   fft32_vfp
        .word   fft64_vfp
        .word   fft128_vfp
        .word   fft256_vfp
        .word   fft512_vfp
        .word   fft1024_vfp
        .word   fft2048_vfp
        .word   fft4096_vfp
        .word   fft8192_vfp
        .word   fft16384_vfp
        .word   fft32768_vfp
        .word   fft65536_vfp
endconst

@ ---------------------------------------------------------------------------
@ fft4_vfp: in-place 4-point transform of z[0..3] at a1.
@ Each z[k] is a pair of single-precision floats (re, im), 8 bytes.
@ Leaf routine: FPSCR is not modified and only s0-s15 are written.
@ ---------------------------------------------------------------------------
function fft4_vfp
        vldr    d0, [a1, #0*2*4]            @ s0,s1   = z[0]
        vldr    d4, [a1, #1*2*4]            @ s8,s9   = z[1]
        vldr    d1, [a1, #2*2*4]            @ s2,s3   = z[2]
        vldr    d5, [a1, #3*2*4]            @ s10,s11 = z[3]
        @ stall
        vadd.f  s12, s0, s8                 @ i0
        vadd.f  s13, s1, s9                 @ i1
        vadd.f  s14, s2, s10                @ i2
        vadd.f  s15, s3, s11                @ i3
        vsub.f  s8, s0, s8                  @ i4
        vsub.f  s9, s1, s9                  @ i5
        vsub.f  s10, s2, s10                @ i6
        vsub.f  s11, s3, s11                @ i7
        @ stall
        @ stall
        vadd.f  s0, s12, s14                @ z[0].re
        vsub.f  s4, s12, s14                @ z[2].re
        vadd.f  s1, s13, s15                @ z[0].im
        vsub.f  s5, s13, s15                @ z[2].im
        vadd.f  s7, s9, s10                 @ z[3].im
        vsub.f  s3, s9, s10                 @ z[1].im
        vadd.f  s2, s8, s11                 @ z[1].re
        vsub.f  s6, s8, s11                 @ z[3].re
        @ stall
        @ stall
        vstr    d0, [a1, #0*2*4]
        vstr    d2, [a1, #2*2*4]
        @ stall
        @ stall
        vstr    d1, [a1, #1*2*4]
        vstr    d3, [a1, #3*2*4]

        bx      lr
endfunc

@ ---------------------------------------------------------------------------
@ macro_fft8_head: main part of the in-place 8-point transform of z[0..7]
@ at a1. Requires FPSCR vector length 4, stride 1: every line tagged as a
@ vector op acts on four consecutive single-precision registers.
@ z[0..3] are also used as scratch memory to move values between register
@ banks. On exit z[0], z[2], z[4], z[5], z[6], z[7] have been stored, while
@ the results for z[1] and z[3] are still held in d8 and d9;
@ macro_fft8_tail stores them.
@ ---------------------------------------------------------------------------
.macro macro_fft8_head
        @ FFT4
        vldr    d4, [a1, #0 * 2*4]
        vldr    d6, [a1, #1 * 2*4]
        vldr    d5, [a1, #2 * 2*4]
        vldr    d7, [a1, #3 * 2*4]
        @ BF
        vldr    d12, [a1, #4 * 2*4]
        vadd.f  s16, s8, s12                @ vector op
        vldr    d14, [a1, #5 * 2*4]
        vldr    d13, [a1, #6 * 2*4]
        vldr    d15, [a1, #7 * 2*4]
        vsub.f  s20, s8, s12                @ vector op
        vadd.f  s0, s16, s18
        vsub.f  s2, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s3, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s5, s21, s22
        vadd.f  s4, s20, s23
        vsub.f  s6, s20, s23
        vsub.f  s20, s24, s28               @ vector op
        vstr    d0, [a1, #0 * 2*4]          @ transfer s0-s7 to s24-s31 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    s0, cos1pi4
        vadd.f  s16, s24, s28               @ vector op
        vstr    d2, [a1, #2 * 2*4]
        vstr    d3, [a1, #3 * 2*4]
        vldr    d12, [a1, #0 * 2*4]
        @ TRANSFORM
        vmul.f  s20, s20, s0                @ vector x scalar op
        vldr    d13, [a1, #1 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vldr    d15, [a1, #3 * 2*4]
        @ BUTTERFLIES
        vadd.f  s0, s18, s16
        vadd.f  s1, s17, s19
        vsub.f  s2, s17, s19
        vsub.f  s3, s18, s16
        vadd.f  s4, s21, s20
        vsub.f  s5, s21, s20
        vadd.f  s6, s22, s23
        vsub.f  s7, s22, s23
        vadd.f  s8, s0, s24                 @ vector op
        vstr    d0, [a1, #0 * 2*4]          @ transfer s0-s3 to s12-s15 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    d6, [a1, #0 * 2*4]
        vldr    d7, [a1, #1 * 2*4]
        vadd.f  s1, s5, s6
        vadd.f  s0, s7, s4
        vsub.f  s2, s5, s6
        vsub.f  s3, s7, s4
        vsub.f  s12, s24, s12               @ vector op
        vsub.f  s5, s29, s1
        vsub.f  s4, s28, s0
        vsub.f  s6, s30, s2
        vsub.f  s7, s31, s3
        vadd.f  s16, s0, s28                @ vector op
        vstr    d6, [a1, #4 * 2*4]
        vstr    d7, [a1, #6 * 2*4]
        vstr    d4, [a1, #0 * 2*4]
        vstr    d5, [a1, #2 * 2*4]
        vstr    d2, [a1, #5 * 2*4]
        vstr    d3, [a1, #7 * 2*4]
.endm

@ macro_fft8_tail: store the two results that macro_fft8_head leaves in
@ d8 and d9 to z[1] and z[3]. Kept separate so that a caller can schedule
@ other instructions between the head and these stores.
.macro macro_fft8_tail
        vstr    d8, [a1, #1 * 2*4]
        vstr    d9, [a1, #3 * 2*4]
.endm

@ .Lfft8_internal_vfp: 8-point transform of z[0..7] at a1, following the
@ modified convention used by the internal routines in this file (FPSCR
@ already set to vector length 4, a2 preserved, any s register may be
@ corrupted).
function .Lfft8_internal_vfp
        macro_fft8_head
        macro_fft8_tail
        bx      lr
endfunc

@ ---------------------------------------------------------------------------
@ fft8_vfp: wrapper around .Lfft8_internal_vfp, reached through fft_tab_vfp.
@ In:    a1 = z
@ Saves FPSCR in a2 and switches to the vector mode needed by the internal
@ routine, preserves s16-s31 on the stack, and keeps the return address in
@ ip because bl overwrites lr. FPSCR is restored before returning.
@ Clobbers: a2, a3, ip, s0-s15.
@ ---------------------------------------------------------------------------
function fft8_vfp
        ldr     a3, =0x03030000             @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}
        mov     ip, lr
        bl      .Lfft8_internal_vfp
        vpop    {s16-s31}
        fmxr    FPSCR, a2
        bx      ip
endfunc

@ Twiddle constants, loaded pc-relative by the code on either side.
@ cos1pi4 and cos1pi8 must stay adjacent and 8-byte aligned:
@ .Lfft16_internal_vfp fetches both with a single vldr of a d register.
.align 3
cos1pi4:    @ cos(1*pi/4) = sqrt(2)/2
        .float  0.707106769084930419921875
cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
        .float  0.92387950420379638671875
cos3pi8:    @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
        .float  0.3826834261417388916015625

@ ---------------------------------------------------------------------------
@ .Lfft16_internal_vfp: in-place 16-point transform of z[0..15] at a1.
@ Same modified convention as .Lfft8_internal_vfp. The 8-point transform
@ of z[0..7], the two 4-point transforms of z[8..11] and z[12..15] and the
@ final combining transforms are interleaved instruction by instruction,
@ and several z[] slots are reused as scratch memory to move values
@ between register banks.
@ ---------------------------------------------------------------------------
function .Lfft16_internal_vfp
        macro_fft8_head
        @ FFT4(z+8)
        vldr    d10, [a1, #8 * 2*4]
        vldr    d12, [a1, #9 * 2*4]
        vldr    d11, [a1, #10 * 2*4]
        vldr    d13, [a1, #11 * 2*4]
        macro_fft8_tail
        vadd.f  s16, s20, s24               @ vector op
        @ FFT4(z+12)
        vldr    d4, [a1, #12 * 2*4]
        vldr    d6, [a1, #13 * 2*4]
        vldr    d5, [a1, #14 * 2*4]
        vsub.f  s20, s20, s24               @ vector op
        vldr    d7, [a1, #15 * 2*4]
        vadd.f  s0, s16, s18
        vsub.f  s4, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s5, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vsub.f  s6, s20, s23
        vadd.f  s16, s8, s12                @ vector op
        vstr    d0, [a1, #8 * 2*4]
        vstr    d2, [a1, #10 * 2*4]
        vstr    d1, [a1, #9 * 2*4]
        vsub.f  s20, s8, s12                @ vector op
        vstr    d3, [a1, #11 * 2*4]
        @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
        vldr    d12, [a1, #10 * 2*4]
        vadd.f  s0, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s6, s16, s18
        vsub.f  s7, s17, s19
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vadd.f  s5, s21, s22
        vsub.f  s4, s20, s23
        vstr    d0, [a1, #12 * 2*4]
        vmov    s0, s6
        @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
        vldr    d6, [a1, #9 * 2*4]
        vstr    d1, [a1, #13 * 2*4]
        vldr    d1, cos1pi4                 @ s2 = cos1pi4, s3 = cos1pi8
        vstr    d2, [a1, #15 * 2*4]
        vldr    d7, [a1, #13 * 2*4]
        vadd.f  s4, s25, s24
        vsub.f  s5, s25, s24
        vsub.f  s6, s0, s7
        vadd.f  s7, s0, s7
        vmul.f  s20, s12, s3                @ vector * scalar op
        @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
        vldr    d4, [a1, #11 * 2*4]
        vldr    d5, [a1, #15 * 2*4]
        vldr    s1, cos3pi8
        vmul.f  s24, s4, s2                 @ vector * scalar op
        vmul.f  s28, s12, s1                @ vector * scalar op
        vmul.f  s12, s8, s1                 @ vector * scalar op
        vadd.f  s4, s20, s29
        vsub.f  s5, s21, s28
        vsub.f  s6, s22, s31
        vadd.f  s7, s23, s30
        vmul.f  s8, s8, s3                  @ vector * scalar op
        vldr    d8, [a1, #1 * 2*4]
        vldr    d9, [a1, #5 * 2*4]
        vldr    d10, [a1, #3 * 2*4]
        vldr    d11, [a1, #7 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vadd.f  s0, s6, s4
        vadd.f  s1, s5, s7
        vsub.f  s2, s5, s7
        vsub.f  s3, s6, s4
        vadd.f  s4, s12, s9
        vsub.f  s5, s13, s8
        vsub.f  s6, s14, s11
        vadd.f  s7, s15, s10
        vadd.f  s12, s0, s16                @ vector op
        vstr    d0, [a1, #1 * 2*4]
        vstr    d1, [a1, #5 * 2*4]
        vldr    d4, [a1, #1 * 2*4]
        vldr    d5, [a1, #5 * 2*4]
        vadd.f  s0, s6, s4
        vadd.f  s1, s5, s7
        vsub.f  s2, s5, s7
        vsub.f  s3, s6, s4
        vsub.f  s8, s16, s8                 @ vector op
        vstr    d6, [a1, #1 * 2*4]
        vstr    d7, [a1, #5 * 2*4]
        vldr    d15, [a1, #6 * 2*4]
        vsub.f  s4, s20, s0
        vsub.f  s5, s21, s1
        vsub.f  s6, s22, s2
        vsub.f  s7, s23, s3
        vadd.f  s20, s0, s20                @ vector op
        vstr    d4, [a1, #9 * 2*4]
        @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
        vldr    d6, [a1, #8 * 2*4]
        vstr    d5, [a1, #13 * 2*4]
        vldr    d7, [a1, #12 * 2*4]
        vstr    d2, [a1, #11 * 2*4]
        vldr    d8, [a1, #0 * 2*4]
        vstr    d3, [a1, #15 * 2*4]
        vldr    d9, [a1, #4 * 2*4]
        vadd.f  s0, s26, s24
        vadd.f  s1, s25, s27
        vsub.f  s2, s25, s27
        vsub.f  s3, s26, s24
        vadd.f  s4, s14, s12
        vadd.f  s5, s13, s15
        vsub.f  s6, s13, s15
        vsub.f  s7, s14, s12
        vadd.f  s8, s0, s28                 @ vector op
        vstr    d0, [a1, #3 * 2*4]
        vstr    d1, [a1, #7 * 2*4]
        vldr    d6, [a1, #3 * 2*4]
        vldr    d7, [a1, #7 * 2*4]
        vsub.f  s0, s16, s4
        vsub.f  s1, s17, s5
        vsub.f  s2, s18, s6
        vsub.f  s3, s19, s7
        vsub.f  s12, s28, s12               @ vector op
        vadd.f  s16, s4, s16                @ vector op
        vstr    d10, [a1, #3 * 2*4]
        vstr    d11, [a1, #7 * 2*4]
        vstr    d4, [a1, #2 * 2*4]
        vstr    d5, [a1, #6 * 2*4]
        vstr    d0, [a1, #8 * 2*4]
        vstr    d1, [a1, #12 * 2*4]
        vstr    d6, [a1, #10 * 2*4]
        vstr    d7, [a1, #14 * 2*4]
        vstr    d8, [a1, #0 * 2*4]
        vstr    d9, [a1, #4 * 2*4]

        bx      lr
endfunc

@ ff_fft16_vfp: exported wrapper around .Lfft16_internal_vfp.
@ In:    a1 = z
@ Same FPSCR / s16-s31 / return-address handling as fft8_vfp.
@ Clobbers: a2, a3, ip, s0-s15.
function ff_fft16_vfp, export=1
        ldr     a3, =0x03030000             @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}
        mov     ip, lr
        bl      .Lfft16_internal_vfp
        vpop    {s16-s31}
        fmxr    FPSCR, a2
        bx      ip
endfunc

@ ---------------------------------------------------------------------------
@ pass n, z0, z1, z2, z3: combining pass used by def_fft.
@   n        number of element pairs to process; the loop at label 1 runs
@            n-1 times and the straight-line code after it handles the
@            final pair
@   z0..z3   registers holding the four data pointers; the caller may pass
@            the same register for several of them
@   o1..o3   assembler symbols set by the caller: element offsets applied
@            on top of z1..z3
@   v5       on entry, pointer to the coefficient table; read upwards
@   v6       set here to v5 + 8*n; read downwards
@ Each pointer register is advanced at most once per iteration: the .if
@ tests on the size mirror the cases in def_fft where z1/z3 (>= 512) and
@ z2 (>= 256) are registers distinct from z0.
@ Clobbers: a4, v5, v6, the z registers, flags, VFP registers.
@ ---------------------------------------------------------------------------
.macro pass n, z0, z1, z2, z3
        add     v6, v5, #4*2*\n
        @ The loop below interleaves the work for these transforms, and each
        @ iteration ends with the loads/multiplies consumed by the next one:
        @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
        @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
        @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
        @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
        vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
        vldmdb  v6!, {s2}
        vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
        vldmia  v5!, {s0,s1}                @ s0 is unused
        vldr    s7, [\z2, #8*o2]            @ t1
        vmul.f  s20, s16, s2                @ vector * scalar
        vldr    s0, [\z3, #8*o3]            @ t5
        vldr    s6, [\z2, #8*o2+4]          @ t2
        vldr    s3, [\z3, #8*o3+4]          @ t6
        vmul.f  s16, s16, s1                @ vector * scalar
        ldr     a4, =\n-1
1:      add     \z0, \z0, #8*2
 .if \n*4*2 >= 512
        add     \z1, \z1, #8*2
 .endif
 .if \n*4*2 >= 256
        add     \z2, \z2, #8*2
 .endif
 .if \n*4*2 >= 512
        add     \z3, \z3, #8*2
 .endif
        @ up to 2 stalls (VFP vector issuing / waiting for s0)
        @ depending upon whether this is the first iteration and
        @ how many add instructions are inserted above
        vadd.f  s4, s0, s7                  @ t5
        vadd.f  s5, s6, s3                  @ t6
        vsub.f  s6, s6, s3                  @ t4
        vsub.f  s7, s0, s7                  @ t3
        vldr    d6, [\z0, #8*0-8*2]         @ s12,s13
        vadd.f  s0, s16, s21                @ t1
        vldr    d7, [\z1, #8*o1-8*2]        @ s14,s15
        vsub.f  s1, s18, s23                @ t5
        vadd.f  s8, s4, s12                 @ vector + vector
        @ stall (VFP vector issuing)
        @ stall (VFP vector issuing)
        @ stall (VFP vector issuing)
        vsub.f  s4, s12, s4
        vsub.f  s5, s13, s5
        vsub.f  s6, s14, s6
        vsub.f  s7, s15, s7
        vsub.f  s2, s17, s20                @ t2
        vadd.f  s3, s19, s22                @ t6
        vstr    d4, [\z0, #8*0-8*2]         @ s8,s9
        vstr    d5, [\z1, #8*o1-8*2]        @ s10,s11
        @ stall (waiting for s5)
        vstr    d2, [\z2, #8*o2-8*2]        @ s4,s5
        vadd.f  s4, s1, s0                  @ t5
        vstr    d3, [\z3, #8*o3-8*2]        @ s6,s7
        vsub.f  s7, s1, s0                  @ t3
        vadd.f  s5, s2, s3                  @ t6
        vsub.f  s6, s2, s3                  @ t4
        vldr    d6, [\z0, #8*1-8*2]         @ s12,s13
        vldr    d7, [\z1, #8*(o1+1)-8*2]    @ s14,s15
        vldr    d4, [\z2, #8*o2]            @ s8,s9
        vldmdb  v6!, {s2,s3}
        vldr    d5, [\z3, #8*o3]            @ s10,s11
        vadd.f  s20, s4, s12                @ vector + vector
        vldmia  v5!, {s0,s1}
        vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
        @ stall (VFP vector issuing)
        vsub.f  s4, s12, s4
        vsub.f  s5, s13, s5
        vsub.f  s6, s14, s6
        vsub.f  s7, s15, s7
        vmul.f  s12, s8, s3                 @ vector * scalar
        vstr    d10, [\z0, #8*1-8*2]        @ s20,s21
        vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
        vstr    d11, [\z1, #8*(o1+1)-8*2]   @ s22,s23
        vmul.f  s8, s8, s0                  @ vector * scalar
        vstr    d2, [\z2, #8*(o2+1)-8*2]    @ s4,s5
        @ stall (waiting for s7)
        vstr    d3, [\z3, #8*(o3+1)-8*2]    @ s6,s7
        vmul.f  s20, s16, s2                @ vector * scalar
        @ stall (VFP vector issuing)
        @ stall (VFP vector issuing)
        @ stall (VFP vector issuing)
        vadd.f  s7, s8, s13                 @ t1
        vsub.f  s6, s9, s12                 @ t2
        vsub.f  s0, s10, s15                @ t5
        vadd.f  s3, s11, s14                @ t6
        vmul.f  s16, s16, s1                @ vector * scalar
        subs    a4, a4, #1
        bne     1b
        @ What remains is the same sequence as the loop body above, but
        @ without the increment of z and without the loads/multiplies
        @ that set up a following iteration
        vadd.f  s4, s0, s7                  @ t5
        vadd.f  s5, s6, s3                  @ t6
        vsub.f  s6, s6, s3                  @ t4
        vsub.f  s7, s0, s7                  @ t3
        vldr    d6, [\z0, #8*0]             @ s12,s13
        vadd.f  s0, s16, s21                @ t1
        vldr    d7, [\z1, #8*o1]            @ s14,s15
        vsub.f  s1, s18, s23                @ t5
        vadd.f  s8, s4, s12                 @ vector + vector
        vsub.f  s4, s12, s4
        vsub.f  s5, s13, s5
        vsub.f  s6, s14, s6
        vsub.f  s7, s15, s7
        vsub.f  s2, s17, s20                @ t2
        vadd.f  s3, s19, s22                @ t6
        vstr    d4, [\z0, #8*0]             @ s8,s9
        vstr    d5, [\z1, #8*o1]            @ s10,s11
        vstr    d2, [\z2, #8*o2]            @ s4,s5
        vadd.f  s4, s1, s0                  @ t5
        vstr    d3, [\z3, #8*o3]            @ s6,s7
        vsub.f  s7, s1, s0                  @ t3
        vadd.f  s5, s2, s3                  @ t6
        vsub.f  s6, s2, s3                  @ t4
        vldr    d6, [\z0, #8*1]             @ s12,s13
        vldr    d7, [\z1, #8*(o1+1)]        @ s14,s15
        vadd.f  s20, s4, s12                @ vector + vector
        vsub.f  s4, s12, s4
        vsub.f  s5, s13, s5
        vsub.f  s6, s14, s6
        vsub.f  s7, s15, s7
        vstr    d10, [\z0, #8*1]            @ s20,s21
        vstr    d11, [\z1, #8*(o1+1)]       @ s22,s23
        vstr    d2, [\z2, #8*(o2+1)]        @ s4,s5
        vstr    d3, [\z3, #8*(o3+1)]        @ s6,s7
.endm

@ ---------------------------------------------------------------------------
@ def_fft n, n2, n4: emit the n-point routines (n2 = n/2, n4 = n/4 as used
@ by the invocations below).
@   .Lfft<n>_internal_vfp  runs the n2-point transform on z[0..], the
@                          n4-point transform on z[n/2..] and on z[3n/4..],
@                          then one combining pass using the external
@                          ff_cos_<n> table
@   fft<n>_vfp             wrapper with the same FPSCR / s16-s31 handling
@                          as fft8_vfp, reached through fft_tab_vfp
@ The three size cases differ in how many pointer registers the pass
@ needs: one base pointer plus assemble-time offsets below 256, two
@ pointers from 256, four pointers from 512. Only the registers actually
@ used are pushed.
@ The trailing .ltorg places the literal pool after each expansion.
@ ---------------------------------------------------------------------------
.macro def_fft n, n2, n4
function .Lfft\n\()_internal_vfp
 .if \n >= 512
        push    {v1-v6,lr}
 .elseif \n >= 256
        push    {v1-v2,v5-v6,lr}
 .else
        push    {v1,v5-v6,lr}
 .endif
        mov     v1, a1
        bl      .Lfft\n2\()_internal_vfp
        add     a1, v1, #8*(\n/4)*2
        bl      .Lfft\n4\()_internal_vfp
        movrelx v5, X(ff_cos_\n), a1
        add     a1, v1, #8*(\n/4)*3
        bl      .Lfft\n4\()_internal_vfp
 .if \n >= 512
  .set o1, 0*(\n/4/2)
  .set o2, 0*(\n/4/2)
  .set o3, 0*(\n/4/2)
        add     v2, v1, #8*2*(\n/4/2)
        add     v3, v1, #8*4*(\n/4/2)
        add     v4, v1, #8*6*(\n/4/2)
        pass    (\n/4/2), v1, v2, v3, v4
        pop     {v1-v6,pc}
 .elseif \n >= 256
  .set o1, 2*(\n/4/2)
  .set o2, 0*(\n/4/2)
  .set o3, 2*(\n/4/2)
        add     v2, v1, #8*4*(\n/4/2)
        pass    (\n/4/2), v1, v1, v2, v2
        pop     {v1-v2,v5-v6,pc}
 .else
  .set o1, 2*(\n/4/2)
  .set o2, 4*(\n/4/2)
  .set o3, 6*(\n/4/2)
        pass    (\n/4/2), v1, v1, v1, v1
        pop     {v1,v5-v6,pc}
 .endif
endfunc

function fft\n\()_vfp
        ldr     a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
        fmrx    a2, FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}
        mov     ip, lr
        bl      .Lfft\n\()_internal_vfp
        vpop    {s16-s31}
        fmxr    FPSCR, a2
        bx      ip
endfunc

.ltorg
.endm

        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384