/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define M_SQRT1_2 0.70710678118654752440

/*
 * Register state shared between the routines in this file.
 *
 * ff_fft_calc_neon is the only exported transform entry point. It loads
 * the values below and then branches to one of the fftN_neon routines,
 * which rely on them without reloading:
 *
 *   x0  = z, pointer to the complex data that is transformed in place
 *   x7  = -8, post-index step for the wim pointer in fft_pass_neon
 *   x14 = address of ff_cos_16 (read by fft16_neon)
 *   v28 = mppm         {-w, +w, +w, -w} with w = M_SQRT1_2
 *   v29 = pmmp         {+1, -1, -1, +1}
 *   v30 = trans4_float (tbl byte indices)
 *   v31 = trans8_float (tbl byte indices)
 *
 * Only v0-v7 and v16-v31 are written by the code in this file.
 */

// Split two source vectors into their even and odd elements:
//   d0 = {s0[0], s1[0], ...}   (trn1)
//   d1 = {s0[1], s1[1], ...}   (trn2)
// d0 is written before s0 and s1 are read for d1, so d0 must not be the
// same register as s0 or s1.
.macro transpose d0, d1, s0, s1
        trn1            \d0, \s0, \s1
        trn2            \d1, \s0, \s1
.endm


// fft4_neon: in-place 4-point transform.
// In:       x0 = z, four complex values stored as {re, im} float pairs
//           (32 bytes are loaded and 32 bytes are stored).
// Clobbers: v0-v7, v16, v17. x0 is left unchanged.
function fft4_neon
        AARCH64_VALID_JUMP_TARGET       // NOTE(review): macro from asm.S, presumably the landing pad for the br in ff_fft_calc_neon
        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        fadd            v4.2s, v0.2s, v1.2s     // r0+r1,i0+i1
        fsub            v6.2s, v0.2s, v1.2s     // r0-r1,i0-i1

        ext             v16.8b, v2.8b, v3.8b, #4        // {i2,r3}
        ext             v17.8b, v3.8b, v2.8b, #4        // {i3,r2}

        fadd            v5.2s, v2.2s, v3.2s     // r2+r3,i2+i3
        fsub            v7.2s, v16.2s, v17.2s   // i2-i3,r3-r2

        fadd            v0.2s, v4.2s, v5.2s
        fsub            v2.2s, v4.2s, v5.2s
        fadd            v1.2s, v6.2s, v7.2s
        fsub            v3.2s, v6.2s, v7.2s

        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc

// fft8_neon: in-place 8-point transform.
// In:       x0 = z, eight complex {re, im} float pairs (64 bytes)
//           v28 = mppm, preloaded by ff_fft_calc_neon
// Clobbers: x0 (advanced by 32), x1 (copy of the original z),
//           v0-v7, v16-v27.
// The instruction order is hand-interleaved; do not regroup it.
function fft8_neon
        AARCH64_VALID_JUMP_TARGET
        mov             x1, x0                  // keep z for the final store of z[0..3]
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32        // z[0..3]
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]             // z[4..7]
        ext             v22.8b, v2.8b, v3.8b, #4
        ext             v23.8b, v3.8b, v2.8b, #4
        fadd            v4.2s, v16.2s, v17.2s   // r4+r5,i4+i5
        fadd            v5.2s, v18.2s, v19.2s   // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s  // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s  // r6-r7,i6-i7
        rev64           v27.2s, v28.2s          // v27 = {w,-w}: low half of mppm with its lanes swapped
        fadd            v20.2s, v0.2s, v1.2s    // r0+r1,i0+i1
        fadd            v21.2s, v2.2s, v3.2s    // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s  // -a2r*w,a2i*w
        ext             v6.8b, v4.8b, v5.8b, #4
        ext             v7.8b, v5.8b, v4.8b, #4
        fmul            v27.2s, v19.2s, v27.2s  // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s  // i2-i3,r3-r2
        fsub            v22.2s, v0.2s, v1.2s    // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]        // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]        // a3r*w,a3i*w
        fadd            v0.2s, v20.2s, v21.2s
        fsub            v2.2s, v20.2s, v21.2s
        fadd            v1.2s, v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s, v22.2s, v23.2s
        fsub            v6.2s, v6.2s, v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r t1,t2
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r t5,t6
        fadd            v7.2s, v4.2s, v5.2s
        fsub            v18.2s, v2.2s, v6.2s
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s, v2.2s, v6.2s
        fsub            v16.2s, v0.2s, v7.2s
        fadd            v5.2s, v25.2s, v24.2s
        fsub            v4.2s, v26.2s, v27.2s
        fadd            v0.2s, v0.2s, v7.2s
        fsub            v17.2s, v1.2s, v5.2s
        fsub            v19.2s, v3.2s, v4.2s
        fadd            v3.2s, v3.2s, v4.2s
        fadd            v1.2s, v1.2s, v5.2s

        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]     // z[4..7]
        st1             {v0.2s, v1.2s, v2.2s, v3.2s}, [x1]      // z[0..3]

        ret
endfunc

// fft16_neon: in-place 16-point transform.
// In:       x0  = z, sixteen complex {re, im} float pairs (128 bytes)
//           x14 = address of ff_cos_16 (four floats are loaded from it)
//           v28 = mppm, v29 = pmmp, v30 = trans4_float, v31 = trans8_float,
//           all preloaded by ff_fft_calc_neon
// Clobbers: x0 and x1 (both advanced by 128), v0-v7, v16-v27.
// z[0..7] goes through the same instruction sequence as fft8_neon, with
// the loads and first steps for z[8..15] interleaved into it. The order is
// hand-scheduled; do not regroup it.
function fft16_neon
        AARCH64_VALID_JUMP_TARGET
        mov             x1, x0                  // keep z for the stores at the end
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32        // z[0..3]
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32        // z[4..7]
        ext             v22.8b, v2.8b, v3.8b, #4
        ext             v23.8b, v3.8b, v2.8b, #4
        fadd            v4.2s, v16.2s, v17.2s   // r4+r5,i4+i5
        fadd            v5.2s, v18.2s, v19.2s   // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s  // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s  // r6-r7,i6-i7
        rev64           v27.2s, v28.2s          // v27 = {w,-w}: low half of mppm with its lanes swapped
        fadd            v20.2s, v0.2s, v1.2s    // r0+r1,i0+i1
        fadd            v21.2s, v2.2s, v3.2s    // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s  // -a2r*w,a2i*w
        ext             v6.8b, v4.8b, v5.8b, #4
        ext             v7.8b, v5.8b, v4.8b, #4
        fmul            v27.2s, v19.2s, v27.2s  // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s  // i2-i3,r3-r2
        fsub            v22.2s, v0.2s, v1.2s    // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]        // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]        // a3r*w,a3i*w
        fadd            v0.2s, v20.2s, v21.2s
        fsub            v2.2s, v20.2s, v21.2s
        fadd            v1.2s, v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s, v22.2s, v23.2s
        fsub            v6.2s, v6.2s, v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r t1,t2
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r t5,t6
        fadd            v7.2s, v4.2s, v5.2s
        fsub            v18.2s, v2.2s, v6.2s
        ld1             {v20.4s,v21.4s}, [x0], #32      // z[8..11]
        ld1             {v22.4s,v23.4s}, [x0], #32      // z[12..15]
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s, v2.2s, v6.2s
        fsub            v16.2s, v0.2s, v7.2s
        fadd            v5.2s, v25.2s, v24.2s
        fsub            v4.2s, v26.2s, v27.2s
        transpose       v24.2d, v25.2d, v20.2d, v22.2d  // v24 = {z[8],z[12]}, v25 = {z[9],z[13]}
        transpose       v26.2d, v27.2d, v21.2d, v23.2d  // v26 = {z[10],z[14]}, v27 = {z[11],z[15]}
        fadd            v0.2s, v0.2s, v7.2s
        fsub            v17.2s, v1.2s, v5.2s
        fsub            v19.2s, v3.2s, v4.2s
        fadd            v3.2s, v3.2s, v4.2s
        fadd            v1.2s, v1.2s, v5.2s
        ext             v20.16b, v21.16b, v21.16b, #4   // rotate v21 left by one 32-bit lane
        ext             v21.16b, v23.16b, v23.16b, #4   // rotate v23 left by one 32-bit lane

        zip1            v0.2d, v0.2d, v1.2d     // {z[0], z[1]}
        zip1            v1.2d, v2.2d, v3.2d     // {z[2], z[3]}
        zip1            v2.2d, v16.2d, v17.2d   // {z[o1], z[o1+1]}
        zip1            v3.2d, v18.2d, v19.2d   // {z[o1+2],z[o1+3]}

        // 2 x fft4
        transpose       v22.2d, v23.2d, v20.2d, v21.2d

        fadd            v4.4s, v24.4s, v25.4s
        fadd            v5.4s, v26.4s, v27.4s
        fsub            v6.4s, v24.4s, v25.4s
        fsub            v7.4s, v22.4s, v23.4s

        ld1             {v23.4s}, [x14]         // four floats from ff_cos_16, used as twiddle factors below

        fadd            v24.4s, v4.4s, v5.4s    // {z[o2+0],z[o2+1]}
        fsub            v26.4s, v4.4s, v5.4s    // {z[o2+2],z[o2+3]}
        fadd            v25.4s, v6.4s, v7.4s    // {z[o3+0],z[o3+1]}
        fsub            v27.4s, v6.4s, v7.4s    // {z[o3+2],z[o3+3]}

        //fft_pass_neon_16
        rev64           v7.4s, v25.4s           // swap re/im within each complex value
        fmul            v25.4s, v25.4s, v23.s[1]
        fmul            v7.4s, v7.4s, v29.4s    // apply the pmmp sign pattern
        fmla            v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a}

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s, v21.4s, v20.4s
        fsub            v6.4s, v20.4s, v21.4s   // just the second half
        fadd            v5.4s, v21.4s, v22.4s   // just the first half

        tbl             v4.16b, {v4.16b}, v30.16b       // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b        // trans8_float

        fsub            v20.4s, v0.4s, v4.4s    // {z[o2],z[o2+1]}
        fadd            v16.4s, v0.4s, v4.4s    // {z[0], z[1]}
        fsub            v22.4s, v2.4s, v5.4s    // {z[o3],z[o3+1]}
        fadd            v18.4s, v2.4s, v5.4s    // {z[o1],z[o1+1]}

        //second half
        rev64           v6.4s, v26.4s
        fmul            v26.4s, v26.4s, v23.s[2]
        rev64           v7.4s, v27.4s
        fmul            v27.4s, v27.4s, v23.s[3]
        fmul            v6.4s, v6.4s, v29.4s
        fmul            v7.4s, v7.4s, v29.4s
        fmla            v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a}

        zip1            v24.4s, v26.4s, v27.4s
        zip2            v25.4s, v26.4s, v27.4s
        fneg            v26.4s, v24.4s
        fadd            v4.4s, v25.4s, v24.4s
        fsub            v6.4s, v24.4s, v25.4s   // just the second half
        fadd            v5.4s, v25.4s, v26.4s   // just the first half

        tbl             v4.16b, {v4.16b}, v30.16b       // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b        // trans8_float

        fadd            v17.4s, v1.4s, v4.4s    // {z[2], z[3]}
        fsub            v21.4s, v1.4s, v4.4s    // {z[o2+2],z[o2+3]}
        fadd            v19.4s, v3.4s, v5.4s    // {z[o1+2],z[o1+3]}
        fsub            v23.4s, v3.4s, v5.4s    // {z[o3+2],z[o3+3]}

        // v16..v23 hold z[0..15] in order; write them back from the saved base.
        st1             {v16.4s,v17.4s}, [x1], #32
        st1             {v18.4s,v19.4s}, [x1], #32
        st1             {v20.4s,v21.4s}, [x1], #32
        st1             {v22.4s,v23.4s}, [x1], #32

        ret
endfunc


// tbl byte indices (loaded into v30) that reorder the four 32-bit lanes
// of one register from {0,1,2,3} to {0,2,1,3}.
const trans4_float, align=4
        .byte    0,  1,  2,  3
        .byte    8,  9, 10, 11
        .byte    4,  5,  6,  7
        .byte   12, 13, 14, 15
endconst

// tbl byte indices (loaded into v31) for a two-register table {a, b}:
// the result lanes are {b[2], a[0], b[3], a[1]}.
const trans8_float, align=4
        .byte   24, 25, 26, 27
        .byte    0,  1,  2,  3
        .byte   28, 29, 30, 31
        .byte    4,  5,  6,  7
endconst

// fft_pass_neon: combining pass over the four quarters of z, in place.
// In:       x0 = z
//           x2 = n (iteration count; each iteration handles two complex
//                values from each quarter)
//           x4 = wre, base of the twiddle table
//           x7 = byte step added to the wim pointer after each load
//                (ff_fft_calc_neon sets it to -8, so wim walks backwards)
//           v29 = pmmp, v30 = trans4_float, v31 = trans8_float
// Derived:  o1 = 2*n, o2 = 4*n, o3 = 6*n (in complex elements of 8 bytes)
//           wim = wre + 2*n floats
// Clobbers: x0-x6, v4-v7, v16, v17, v20-v27, condition flags.
// The first iteration is peeled because it only needs wim[-1]; the
// remaining n-1 iterations run in the loop at label 1.
// NOTE(review): the loop is a do-while on x6 = n-1, so n is expected to be
// at least 2; the callers visible here pass n >= 4.
function fft_pass_neon
        sub             x6, x2, #1              // n - 1, loop counter
        lsl             x5, x2, #3              // 2 * n * sizeof FFTSample
        lsl             x1, x2, #4              // 2 * n * sizeof FFTComplex
        add             x5, x4, x5              // wim
        add             x3, x1, x2, lsl #5      // (2 + 4) * n * sizeof FFTComplex, byte offset of z[o3]
        add             x2, x0, x2, lsl #5      // &z[o2]
        add             x3, x0, x3              // &z[o3]
        add             x1, x0, x1              // &z[o1]
        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
        ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
        trn2            v25.2d, v20.2d, v22.2d  // {z[o2+1],z[o3+1]}
        sub             x5, x5, #4              // wim--
        trn1            v24.2d, v20.2d, v22.2d  // {z[o2],z[o3]}
        ld1             {v5.s}[0], [x5], x7     // v5.s[0] = wim[-1]
        rev64           v7.4s, v25.4s           // swap re/im within each complex value
        fmul            v25.4s, v25.4s, v4.s[1]
        ld1             {v16.4s}, [x0]          // {z[0],z[1]}
        fmul            v7.4s, v7.4s, v29.4s    // apply the pmmp sign pattern
        ld1             {v17.4s}, [x1]          // {z[o1],z[o1+1]}
        prfm            pldl1keep, [x2, #16]    // prefetch the data of the next iteration
        prfm            pldl1keep, [x3, #16]
        fmla            v25.4s, v7.4s, v5.s[0]  // {t1a,t2a,t5a,t6a}
        prfm            pldl1keep, [x0, #16]
        prfm            pldl1keep, [x1, #16]

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s, v21.4s, v20.4s
        fsub            v6.4s, v20.4s, v21.4s   // just the second half
        fadd            v5.4s, v21.4s, v22.4s   // just the first half

        tbl             v4.16b, {v4.16b}, v30.16b       // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b        // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
1:
        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
        ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
        transpose       v26.2d, v27.2d, v20.2d, v22.2d
        ld1             {v5.2s}, [x5], x7       // {wim[-1],wim[0]}
        rev64           v6.4s, v26.4s
        fmul            v26.4s, v26.4s, v4.s[0]
        rev64           v7.4s, v27.4s
        fmul            v27.4s, v27.4s, v4.s[1]
        fmul            v6.4s, v6.4s, v29.4s
        fmul            v7.4s, v7.4s, v29.4s
        ld1             {v16.4s},[x0]           // {z[0],z[1]}
        fmla            v26.4s, v6.4s, v5.s[1]  // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s, v5.s[0]  // {t1a,t2a,t5a,t6a}
        ld1             {v17.4s},[x1]           // {z[o1],z[o1+1]}

        subs            x6, x6, #1              // n--; the flags are consumed by b.ne at the loop end

        zip1            v20.4s, v26.4s, v27.4s
        zip2            v21.4s, v26.4s, v27.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s, v21.4s, v20.4s
        fsub            v6.4s, v20.4s, v21.4s   // just the second half
        fadd            v5.4s, v21.4s, v22.4s   // just the first half

        tbl             v4.16b, {v4.16b}, v30.16b       // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b        // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
        b.ne            1b

        ret
endfunc

// def_fft n, n2, n4: emit fft<n>_neon, where n2 = n/2 and n4 = n/4.
// The generated routine runs, in place on z = x0:
//   fft<n2>_neon on z[0 .. n2-1]
//   fft<n4>_neon on z[2*n4 .. 3*n4-1]
//   fft<n4>_neon on z[3*n4 .. n-1]
// and then tail-calls fft_pass_neon with x0 = z, x4 = ff_cos_<n> and
// x2 = n4/2. fft_pass_neon returns directly to the caller of fft<n>_neon.
// x28 holds &z[2*n4] across the calls; it and x30 are saved in a 16-byte
// stack frame that is released before the tail call.
.macro def_fft n, n2, n4
function fft\n\()_neon, align=6
        AARCH64_VALID_JUMP_TARGET
        AARCH64_SIGN_LINK_REGISTER      // NOTE(review): macro from asm.S, presumably return-address signing; pairs with the VALIDATE below
        sub             sp, sp, #16
        stp             x28, x30, [sp]
        add             x28, x0, #\n4*2*8       // &z[2*n4], 8 bytes per complex value
        bl              fft\n2\()_neon
        mov             x0, x28
        bl              fft\n4\()_neon
        add             x0, x28, #\n4*1*8       // &z[3*n4]
        bl              fft\n4\()_neon
        sub             x0, x28, #\n4*2*8       // back to z
        ldp             x28, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        movrel          x4, X(ff_cos_\n)
        mov             x2, #\n4>>1
        b               fft_pass_neon
endfunc
.endm

        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

// ff_fft_calc_neon: exported entry point; transforms z in place.
// In:  x0 = pointer to a context whose first 32-bit word is read as nbits
//           (the same field ff_fft_permute_neon reads as nbits)
//      x1 = z
// NOTE(review): presumably called from C as
//      void ff_fft_calc_neon(FFTContext *s, FFTComplex *z)
// confirm against the C declaration.
// Loads the shared constants (x7, x14, v28-v31), moves z into x0 and
// branches to fft_tab_neon[nbits - 2]. The selected routine returns
// straight to the caller of this function.
// nbits is not range checked here; fft_tab_neon has 15 entries, covering
// nbits = 2 .. 16.
function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]         // start fetching z
        movrel          x10, trans4_float
        ldr             w2, [x0]                // nbits
        movrel          x11, trans8_float
        sub             w2, w2, #2              // table index; the first entry is fft4 (nbits = 2)
        movrel          x3, fft_tab_neon
        ld1             {v30.16b}, [x10]        // trans4_float
        mov             x7, #-8                 // wim step used by fft_pass_neon
        movrel          x12, pmmp
        ldr             x3, [x3, x2, lsl #3]    // address of the fftN_neon routine
        movrel          x13, mppm
        movrel          x14, X(ff_cos_16)       // read by fft16_neon
        ld1             {v31.16b}, [x11]        // trans8_float
        mov             x0, x1                  // the fftN_neon routines take z in x0
        ld1             {v29.4s}, [x12]         // pmmp
        ld1             {v28.4s}, [x13]         // mppm
        br              x3
endfunc

// ff_fft_permute_neon: exported; reorders z according to revtab.
// In:  x0 = pointer to a context with
//             [x0]      32-bit nbits
//             [x0, #8]  revtab, read as an array of 16-bit indices
//             [x0, #16] tmp_buf, scratch for 1 << nbits elements of 8 bytes
//      x1 = z
// First loop:  tmp_buf[revtab[i]] = z[i], two elements per iteration.
// Second loop: copy tmp_buf back over z, four elements per iteration.
// Clobbers: x0-x6, v0, v1, condition flags.
// NOTE(review): the copy loop moves 32 bytes per iteration, so 1 << nbits
// is assumed to be a multiple of 4; verify that callers guarantee it.
function ff_fft_permute_neon, export=1
        mov             x6, #1
        ldr             w2, [x0]                // nbits
        ldr             x3, [x0, #16]           // tmp_buf
        ldr             x0, [x0, #8]            // revtab
        lsl             x6, x6, x2              // n = 1 << nbits
        mov             x2, x6                  // second copy of n for the copy-back loop
1:
        ld1             {v0.2s,v1.2s}, [x1], #16        // z[i], z[i+1]
        ldr             w4, [x0], #4            // two 16-bit revtab entries at once
        uxth            w5, w4                  // revtab[i]
        lsr             w4, w4, #16             // revtab[i+1]
        add             x5, x3, x5, lsl #3      // &tmp_buf[revtab[i]]
        add             x4, x3, x4, lsl #3      // &tmp_buf[revtab[i+1]]
        st1             {v0.2s}, [x5]
        st1             {v1.2s}, [x4]
        subs            x6, x6, #2
        b.gt            1b

        sub             x1, x1, x2, lsl #3      // rewind x1 to the start of z
1:
        ld1             {v0.4s,v1.4s}, [x3], #32
        st1             {v0.4s,v1.4s}, [x1], #32
        subs            x2, x2, #4
        b.gt            1b

        ret
endfunc

// Dispatch table for ff_fft_calc_neon, indexed by nbits - 2.
const fft_tab_neon, relocate=1
        .quad fft4_neon
        .quad fft8_neon
        .quad fft16_neon
        .quad fft32_neon
        .quad fft64_neon
        .quad fft128_neon
        .quad fft256_neon
        .quad fft512_neon
        .quad fft1024_neon
        .quad fft2048_neon
        .quad fft4096_neon
        .quad fft8192_neon
        .quad fft16384_neon
        .quad fft32768_neon
        .quad fft65536_neon
endconst

// Sign pattern loaded into v29.
const pmmp, align=4
        .float          +1.0, -1.0, -1.0, +1.0
endconst

// Signed 1/sqrt(2) pattern loaded into v28.
const mppm, align=4
        .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst