/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* 1/sqrt(2); referenced by the mppm constant table in this file. */
#define M_SQRT1_2 0.70710678118654752440

/*
 * transpose d0, d1, s0, s1
 *
 * d0 = trn1(s0, s1): even-numbered elements of s0 and s1, interleaved.
 * d1 = trn2(s0, s1): odd-numbered elements of s0 and s1, interleaved.
 * The element size is taken from the arrangement suffix of the operands.
 *
 * d0 must not be the same register as s0 or s1, because the second
 * instruction reads both sources after the first has written d0.
 */
.macro transpose d0, d1, s0, s1
        trn1            \d0, \s0, \s1
        trn2            \d1, \s0, \s1
.endm


/*
 * fft4_neon: in-place 4-point complex FFT.
 *
 * In:    x0 = pointer to 4 complex values, each stored as two consecutive
 *             32-bit floats {re, im} (32 bytes are loaded and stored back)
 * Out:   the same 32 bytes overwritten with the transform
 * Clobb: v0-v7, v16, v17
 *
 * Internal entry point: reached by branch from the dispatch table and by
 * bl from the larger transforms, hence the jump-target landing pad.
 */
function fft4_neon
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        fadd            v4.2s,  v0.2s,  v1.2s   // r0+r1,i0+i1
        fsub            v6.2s,  v0.2s,  v1.2s   // r0-r1,i0-i1

        ext             v16.8b, v2.8b,  v3.8b,  #4      // i2,r3
        ext             v17.8b, v3.8b,  v2.8b,  #4      // i3,r2

        fadd            v5.2s,  v2.2s,  v3.2s   // r2+r3,i2+i3
        fsub            v7.2s,  v16.2s, v17.2s  // i2-i3,r3-r2

        fadd            v0.2s,  v4.2s,  v5.2s
        fsub            v2.2s,  v4.2s,  v5.2s
        fadd            v1.2s,  v6.2s,  v7.2s
        fsub            v3.2s,  v6.2s,  v7.2s

        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc

/*
 * fft8_neon: in-place 8-point complex FFT.
 *
 * In:    x0  = pointer to 8 complex values ({re, im} float pairs, 64 bytes)
 *        v28 = the mppm constant {-w, +w, +w, -w} with w = 1/sqrt(2), as
 *              loaded by ff_fft_calc_neon; only lanes 0 and 1 are read here
 * Out:   the 64 bytes at the original x0 overwritten with the transform
 * Clobb: x0 (left pointing at z[4]), x1, v0-v7, v16-v27
 *
 * The instruction order interleaves independent work on z[0..3] and
 * z[4..7]; the per-line comments name the lanes each result holds.
 */
function fft8_neon
        AARCH64_VALID_JUMP_TARGET
        mov             x1,  x0                          // keep &z[0] for the final store
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
        ext             v22.8b, v2.8b,  v3.8b,  #4       // i2,r3
        ext             v23.8b, v3.8b,  v2.8b,  #4       // i3,r2
        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                   // +w,-w  (low half of v28 with lanes swapped)
        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s

        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]     // z[4..7]
        st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]     // z[0..3]

        ret
endfunc

/*
 * fft16_neon: in-place 16-point complex FFT.
 *
 * In:    x0  = pointer to 16 complex values ({re, im} float pairs, 128 bytes)
 *        x14 = address of four floats used as twiddle factors; set to
 *              ff_cos_16 by ff_fft_calc_neon
 *        v28 = mppm, v29 = pmmp, v30 = trans4_float, v31 = trans8_float,
 *              all loaded by ff_fft_calc_neon
 * Out:   the 128 bytes at the original x0 overwritten with the transform
 * Clobb: x0 (advanced by 128), x1 (advanced by 128), v0-v7, v16-v27
 *
 * Layout of the routine:
 *   1. the same 8-point sequence as fft8_neon on z[0..7], interleaved with
 *      the loads and transposes of z[8..15];
 *   2. two 4-point butterflies on z[8..11] and z[12..15] (4s arithmetic);
 *   3. a combining pass ("fft_pass_neon_16") using the twiddles from x14.
 */
function fft16_neon
        AARCH64_VALID_JUMP_TARGET
        mov             x1,  x0                          // keep &z[0] for the final stores
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
        ext             v22.8b, v2.8b,  v3.8b,  #4
        ext             v23.8b, v3.8b,  v2.8b,  #4
        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                   // +w,-w  (low half of v28 with lanes swapped)
        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ld1             {v20.4s,v21.4s}, [x0], #32      // z[8..11]
        ld1             {v22.4s,v23.4s}, [x0], #32      // z[12..15]
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        transpose       v24.2d, v25.2d, v20.2d, v22.2d
        transpose       v26.2d, v27.2d, v21.2d, v23.2d
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s
        ext             v20.16b, v21.16b, v21.16b,  #4  // rotate the four floats by one lane
        ext             v21.16b, v23.16b, v23.16b,  #4

        // pack pairs of 64-bit results into full 128-bit registers
        zip1            v0.2d,  v0.2d,  v1.2d   // {z[0],   z[1]}
        zip1            v1.2d,  v2.2d,  v3.2d   // {z[2],   z[3]}
        zip1            v2.2d,  v16.2d, v17.2d  // {z[o1],  z[o1+1]}
        zip1            v3.2d,  v18.2d, v19.2d  // {z[o1+2],z[o1+3]}

        // 2 x fft4
        transpose       v22.2d, v23.2d, v20.2d, v21.2d

        fadd            v4.4s,  v24.4s, v25.4s
        fadd            v5.4s,  v26.4s, v27.4s
        fsub            v6.4s,  v24.4s, v25.4s
        fsub            v7.4s,  v22.4s, v23.4s

        ld1             {v23.4s},  [x14]        // four twiddle floats

        fadd            v24.4s, v4.4s,  v5.4s   // {z[o2+0],z[o2+1]}
        fsub            v26.4s, v4.4s,  v5.4s   // {z[o2+2],z[o2+3]}
        fadd            v25.4s, v6.4s,  v7.4s   // {z[o3+0],z[o3+1]}
        fsub            v27.4s, v6.4s,  v7.4s   // {z[o3+2],z[o3+3]}

        //fft_pass_neon_16
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v23.s[1]
        fmul            v7.4s,  v7.4s,  v29.4s  // apply the pmmp sign pattern
        fmla            v25.4s, v7.4s,  v23.s[3] // {t1a,t2a,t5a,t6a}

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fsub            v20.4s, v0.4s,  v4.4s   // {z[o2],z[o2+1]}
        fadd            v16.4s, v0.4s,  v4.4s   // {z[0], z[1]}
        fsub            v22.4s, v2.4s,  v5.4s   // {z[o3],z[o3+1]}
        fadd            v18.4s, v2.4s,  v5.4s   // {z[o1],z[o1+1]}

//second half
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v23.s[2]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v23.s[3]
        fmul            v6.4s,  v6.4s,  v29.4s  // apply the pmmp sign pattern
        fmul            v7.4s,  v7.4s,  v29.4s
        fmla            v26.4s, v6.4s,  v23.s[2] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v23.s[1] // {t1a,t2a,t5a,t6a}

        zip1            v24.4s, v26.4s, v27.4s
        zip2            v25.4s, v26.4s, v27.4s
        fneg            v26.4s, v24.4s
        fadd            v4.4s,  v25.4s, v24.4s
        fsub            v6.4s,  v24.4s, v25.4s  // just the second half
        fadd            v5.4s,  v25.4s, v26.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v17.4s, v1.4s, v4.4s    // {z[2], z[3]}
        fsub            v21.4s, v1.4s, v4.4s    // {z[o2+2],z[o2+3]}
        fadd            v19.4s, v3.4s, v5.4s    // {z[o1+2],z[o1+3]}
        fsub            v23.4s, v3.4s, v5.4s    // {z[o3+2],z[o3+3]}

        // write back z[0..15] in order through the saved base pointer
        st1             {v16.4s,v17.4s}, [x1], #32
        st1             {v18.4s,v19.4s}, [x1], #32
        st1             {v20.4s,v21.4s}, [x1], #32
        st1             {v22.4s,v23.4s}, [x1], #32

        ret
endfunc


/*
 * tbl byte indices for a single 16-byte source register: output 32-bit
 * lanes are source lanes {0, 2, 1, 3}, i.e. the two middle lanes swap.
 * Loaded into v30 by ff_fft_calc_neon.
 */
const  trans4_float, align=4
        .byte    0,  1,  2,  3
        .byte    8,  9, 10, 11
        .byte    4,  5,  6,  7
        .byte   12, 13, 14, 15
endconst

/*
 * tbl byte indices for a two-register (32-byte) table {a, b}: bytes 0-15
 * select from a and bytes 16-31 from b, so the output 32-bit lanes are
 * {b[2], a[0], b[3], a[1]}.
 * Loaded into v31 by ff_fft_calc_neon.
 */
const  trans8_float, align=4
        .byte   24, 25, 26, 27
        .byte    0,  1,  2,  3
        .byte   28, 29, 30, 31
        .byte    4,  5,  6,  7
endconst

/*
 * fft_pass_neon: combining pass over four equally sized quarters of z.
 *
 * In:    x0  = z (complex floats, 8 bytes each)
 *        x2  = n; the quarters start at z[0], z[o1], z[o2], z[o3] with
 *              o1 = 2n, o2 = 4n, o3 = 6n
 *        x4  = wre, twiddle table read forwards two floats at a time
 *        x7  = post-index step for the wim pointer; ff_fft_calc_neon
 *              sets it to -8 so wim is walked backwards
 *        v29 = pmmp, v30 = trans4_float, v31 = trans8_float
 * Out:   z updated in place, two complex values per quarter per iteration
 * Clobb: x0-x6, v4-v7, v16, v17, v20-v27, flags
 *
 * The first pair of elements is handled by a peeled prologue that needs
 * only one wim value; the loop then runs n - 1 more times. The loop is
 * bottom-tested on a counter initialised to n - 1, so n must be at least
 * 2 (the def_fft callers pass at least 4).
 */
function fft_pass_neon
        sub             x6,  x2,  #1            // n - 1, loop counter
        lsl             x5,  x2,  #3            // 2 * n * sizeof FFTSample
        lsl             x1,  x2,  #4            // 2 * n * sizeof FFTComplex
        add             x5,  x4,  x5            // wim
        add             x3,  x1,  x2,  lsl #5   // (2 + 4) * n * sizeof FFTComplex
        add             x2,  x0,  x2,  lsl #5   // &z[o2]
        add             x3,  x0,  x3            // &z[o3]
        add             x1,  x0,  x1            // &z[o1]
        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
        ld1             {v4.2s},  [x4], #8      // {wre[0],wre[1]}
        trn2            v25.2d, v20.2d, v22.2d
        sub             x5,  x5,  #4            // wim--
        trn1            v24.2d, v20.2d, v22.2d
        ld1             {v5.s}[0],  [x5], x7    // d5[0] = wim[-1]
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v4.s[1]
        ld1             {v16.4s}, [x0]          // {z[0],z[1]}
        fmul            v7.4s,  v7.4s,  v29.4s  // apply the pmmp sign pattern
        ld1             {v17.4s}, [x1]          // {z[o1],z[o1+1]}
        prfm            pldl1keep, [x2, #16]
        prfm            pldl1keep, [x3, #16]
        fmla            v25.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
        prfm            pldl1keep, [x0, #16]
        prfm            pldl1keep, [x1, #16]

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
1:
        ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]    // {z[o3],z[o3+1]}
        ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
        transpose       v26.2d, v27.2d, v20.2d, v22.2d
        ld1             {v5.2s}, [x5], x7       // {wim[-1],wim[0]}
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v4.s[0]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v4.s[1]
        fmul            v6.4s,  v6.4s,  v29.4s  // apply the pmmp sign pattern
        fmul            v7.4s,  v7.4s,  v29.4s
        ld1             {v16.4s},[x0]           // {z[0],z[1]}
        fmla            v26.4s, v6.4s,  v5.s[1] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
        ld1             {v17.4s},[x1]           // {z[o1],z[o1+1]}

        subs            x6,  x6,  #1            // n--; flags are consumed by b.ne below

        zip1            v20.4s, v26.4s, v27.4s
        zip2            v21.4s, v26.4s, v27.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
        b.ne            1b

        ret
endfunc

/*
 * def_fft n, n2, n4
 *
 * Emits fft<n>_neon, an in-place n-point complex FFT built from smaller
 * ones. Callers in this file pass n2 = n/2 and n4 = n/4.
 *
 * In:    x0 = z (complex floats, 8 bytes each)
 * Steps: fft<n2> on z[0 .. n/2), fft<n4> on z[n/2 ..) and on z[3n/4 ..),
 *        then a tail call to fft_pass_neon with x0 = z,
 *        x4 = ff_cos_<n> and x2 = n4 / 2.
 *
 * x28 (callee-saved) holds &z[n/2] across the three calls. It is pushed
 * together with the link register and both are restored before the tail
 * call, so fft_pass_neon returns straight to this function's caller.
 * The 16-byte push keeps sp 16-byte aligned.
 */
.macro  def_fft n, n2, n4
function fft\n\()_neon, align=6
        AARCH64_VALID_JUMP_TARGET
        AARCH64_SIGN_LINK_REGISTER
        stp             x28, x30, [sp, #-16]!   // push x28 and lr in one pre-indexed store
        add             x28, x0,  #\n4*2*8      // x28 = &z[n/2]
        bl              fft\n2\()_neon
        mov             x0,  x28
        bl              fft\n4\()_neon
        add             x0,  x28, #\n4*1*8      // &z[3n/4]
        bl              fft\n4\()_neon
        sub             x0,  x28, #\n4*2*8      // back to &z[0]
        ldp             x28, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        movrel          x4,  X(ff_cos_\n)
        mov             x2,  #\n4>>1
        b               fft_pass_neon
endfunc
.endm

362        def_fft    32,    16,     8
363        def_fft    64,    32,    16
364        def_fft   128,    64,    32
365        def_fft   256,   128,    64
366        def_fft   512,   256,   128
367        def_fft  1024,   512,   256
368        def_fft  2048,  1024,   512
369        def_fft  4096,  2048,  1024
370        def_fft  8192,  4096,  2048
371        def_fft 16384,  8192,  4096
372        def_fft 32768, 16384,  8192
373        def_fft 65536, 32768, 16384
374
/*
 * ff_fft_calc_neon: exported entry point; dispatches to the size-specific
 * transform.
 *
 * In:    x0 = pointer to a context whose first 32-bit word is read as the
 *             transform size exponent (ff_fft_permute_neon labels the same
 *             field nbits)
 *        x1 = z, the complex buffer transformed in place
 *
 * The handler is fetched from fft_tab_neon at index nbits - 2. No range
 * check is made; the table has 15 entries, so nbits is expected to be in
 * 2..16.
 *
 * State established here for the internal routines:
 *        x0  = z
 *        x7  = -8           (wim step used by fft_pass_neon)
 *        x14 = ff_cos_16    (twiddles read by fft16_neon)
 *        v28 = mppm, v29 = pmmp, v30 = trans4_float, v31 = trans8_float
 *
 * The final br is a tail jump: the selected routine returns directly to
 * this function's caller.
 */
function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]
        movrel          x10, trans4_float
        ldr             w2,  [x0]                // nbits; the w-register write zero-extends into x2
        movrel          x11, trans8_float
        sub             w2,  w2,  #2             // table index = nbits - 2
        movrel          x3,  fft_tab_neon
        ld1             {v30.16b}, [x10]         // trans4_float
        mov             x7,  #-8
        movrel          x12, pmmp
        ldr             x3,  [x3, x2, lsl #3]    // handler address, 8 bytes per entry
        movrel          x13, mppm
        movrel          x14, X(ff_cos_16)
        ld1             {v31.16b}, [x11]         // trans8_float
        mov             x0,  x1
        ld1             {v29.4s},  [x12]         // pmmp
        ld1             {v28.4s},  [x13]         // mppm
        br              x3
endfunc

/*
 * ff_fft_permute_neon: exported; reorders z through a scratch buffer
 * according to an index table.
 *
 * In:    x0 = pointer to a context read at fixed offsets:
 *               +0   32-bit nbits; n = 1 << nbits elements are processed
 *               +8   revtab, pointer to 16-bit destination indices
 *               +16  tmp_buf, pointer to scratch space for n elements
 *        x1 = z, complex floats (8 bytes each)
 * Out:   z rewritten so that, for every j, new z[revtab[j]] = old z[j]
 * Clobb: x0-x6, v0, v1, flags
 *
 * Loop 1 scatters two elements per iteration into tmp_buf; one 32-bit
 * load fetches two adjacent 16-bit revtab entries. Loop 2 copies tmp_buf
 * back over z, four elements (32 bytes) per iteration.
 */
function ff_fft_permute_neon, export=1
        mov             x6,  #1
        ldr             w2,  [x0]       // nbits
        ldr             x3,  [x0, #16]  // tmp_buf
        ldr             x0,  [x0, #8]   // revtab
        lsl             x6,  x6, x2     // n = 1 << nbits, counter for loop 1
        mov             x2,  x6         // second copy of n for the rewind and loop 2
1:
        ld1             {v0.2s,v1.2s}, [x1], #16        // z[j], z[j+1]
        ldr             w4,  [x0], #4                   // revtab[j] | revtab[j+1] << 16
        uxth            w5,  w4                         // revtab[j]
        lsr             w4,  w4,  #16                   // revtab[j+1]
        add             x5,  x3,  x5,  lsl #3           // &tmp_buf[revtab[j]]
        add             x4,  x3,  x4,  lsl #3           // &tmp_buf[revtab[j+1]]
        st1             {v0.2s}, [x5]
        st1             {v1.2s}, [x4]
        subs            x6,  x6, #2
        b.gt            1b

        sub             x1,  x1,  x2,  lsl #3           // rewind z by n * 8 bytes
1:
        ld1             {v0.4s,v1.4s}, [x3], #32
        st1             {v0.4s,v1.4s}, [x1], #32
        subs            x2,  x2,  #4
        b.gt            1b

        ret
endfunc

/*
 * Dispatch table for ff_fft_calc_neon: 8-byte code addresses indexed by
 * nbits - 2, so entry 0 is the 4-point transform and entry 14 the
 * 65536-point one.
 */
const   fft_tab_neon, relocate=1
        .quad fft4_neon
        .quad fft8_neon
        .quad fft16_neon
        .quad fft32_neon
        .quad fft64_neon
        .quad fft128_neon
        .quad fft256_neon
        .quad fft512_neon
        .quad fft1024_neon
        .quad fft2048_neon
        .quad fft4096_neon
        .quad fft8192_neon
        .quad fft16384_neon
        .quad fft32768_neon
        .quad fft65536_neon
endconst

/*
 * Sign pattern {+, -, -, +}. ff_fft_calc_neon loads it into v29; the
 * combining passes multiply lane-swapped operands by it.
 */
const   pmmp, align=4
        .float          +1.0, -1.0, -1.0, +1.0
endconst

/*
 * {-w, +w, +w, -w} with w = 1/sqrt(2). ff_fft_calc_neon loads it into
 * v28; the 8-point butterflies read lanes 0 and 1.
 */
const   mppm, align=4
        .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst
