/*
 * AArch64 NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/*
 * ff_imdct_half_neon(ctx, output, input)
 *
 * In:  x0 = context pointer; the code reads three fields from it:
 *          [x0, #8]  revtab (bit-reversal permutation table)
 *          [x0, #28] mdct_bits (log2 of transform size n)
 *          [x0, #32] tcos (interleaved cos/sin twiddle table)
 *      x1 = output buffer (also used as the FFT work buffer)
 *      x2 = input buffer
 * Clobbers x3-x14 and v0-v25 (all caller-saved per AAPCS64); x19/x20
 * and x30 are saved in a 32-byte, 16-byte-aligned stack frame.
 * NOTE(review): field offsets 8/28/32 presumably match FFTContext in
 * the project headers — confirm against the C side if they change.
 *
 * Stage 1: pre-rotate the input by the twiddles and scatter the pairs
 * to bit-reversed positions in the output (driven by revtab).
 * Stage 2: in-place complex FFT via ff_fft_calc_neon.
 * Stage 3: post-rotate the FFT result, writing the two output halves
 * inward/outward from the buffer centre.
 * Both loops are software-pipelined: the loads for iteration i+1 are
 * issued before the stores of iteration i, with an epilogue (labels
 * 2:/4:) that flushes the final pair without over-reading.
 */
function ff_imdct_half_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #16]
        mov             x12, #1
        ldr             w14, [x0, #28]          // mdct_bits
        ldr             x4,  [x0, #32]          // tcos
        ldr             x3,  [x0, #8]           // revtab
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #2            // n4 = n >> 2, pre-rotate loop count
        add             x7,  x2,  x12, lsl #1   // x7 = input + n/2 floats (end of half)
        mov             x12, #-16               // x12 now the downward stride
        sub             x7,  x7,  #16

        // Pre-rotation: walk the input from both ends at once.
        ld2             {v16.2s,v17.2s}, [x7], x12      // d16=x,n1 d17=x,n0
        ld2             {v0.2s,v1.2s},   [x2], #16      // d0 =m0,x d1 =m1,x
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s},   [x4], #16      // d2=c0,c1 d3=s0,s2
        fmul            v6.2s,  v17.2s, v2.2s
        fmul            v7.2s,  v0.2s,  v2.2s
1:
        subs            x14, x14, #2
        ldr             w6,  [x3], #4           // next revtab word: two 16-bit indices
        fmul            v4.2s,  v0.2s,  v3.2s
        fmul            v5.2s,  v17.2s, v3.2s
        fsub            v4.2s,  v6.2s,  v4.2s   // complex multiply, real part
        fadd            v5.2s,  v5.2s,  v7.2s   // complex multiply, imag part
        ubfm            x8,  x6,  #16, #31      // high halfword -> second index
        ubfm            x6,  x6,  #0,  #15      // low halfword  -> first index
        add             x8,  x1,  x8,  lsl #3   // scale by 8 (one complex float pair)
        add             x6,  x1,  x6,  lsl #3
        b.eq            2f                      // last pair: store without reloading
        // Pipeline: load and start the multiplies for the next iteration
        // before storing the current results.
        ld2             {v16.2s,v17.2s}, [x7], x12
        ld2             {v0.2s,v1.2s},   [x2], #16
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s},   [x4], #16      // d2=c0,c1 d3=s0,s2
        fmul            v6.2s,  v17.2s, v2.2s
        fmul            v7.2s,  v0.2s,  v2.2s
        st2             {v4.s,v5.s}[0], [x6]    // scatter to bit-reversed slots
        st2             {v4.s,v5.s}[1], [x8]
        b               1b
2:      // Epilogue: flush the final pre-rotated pair.
        st2             {v4.s,v5.s}[0], [x6]
        st2             {v4.s,v5.s}[1], [x8]

        mov             x19, x0                 // keep ctx across the call
        mov             x20, x1                 // keep output across the call
        bl              X(ff_fft_calc_neon)

        // Post-rotation: reload sizes (x12/x14 were clobbered above).
        mov             x12, #1
        ldr             w14, [x19, #28]         // mdct_bits
        ldr             x4,  [x19, #32]         // tcos
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #3            // n8 = n >> 3, post-rotate loop count

        add             x4,  x4,  x14, lsl #3   // twiddles at centre, walked both ways
        add             x6,  x20, x14, lsl #3   // data centre: x6 walks up, x3 walks down
        sub             x1,  x4,  #16
        sub             x3,  x6,  #16

        mov             x7,  #-16               // downward stride
        mov             x8,  x6                 // store pointer, upward half
        mov             x0,  x3                 // store pointer, downward half

        ld2             {v0.2s,v1.2s},   [x3], x7       // d0 =i1,r1 d1 =i0,r0
        ld2             {v20.2s,v21.2s}, [x6], #16      // d20=i2,r2 d21=i3,r3
        ld2             {v16.2s,v17.2s}, [x1], x7       // d16=c1,c0 d18=s1,s0
3:
        subs            x14, x14, #2
        fmul            v7.2s,  v0.2s,  v17.2s
        ld2             {v18.2s,v19.2s}, [x4], #16      // d17=c2,c3 d19=s2,s3
        fmul            v4.2s,  v1.2s,  v17.2s
        fmul            v6.2s,  v21.2s, v19.2s
        fmul            v5.2s,  v20.2s, v19.2s
        fmul            v22.2s, v1.2s,  v16.2s
        fmul            v23.2s, v21.2s, v18.2s
        fmul            v24.2s, v0.2s,  v16.2s
        fmul            v25.2s, v20.2s, v18.2s
        fadd            v7.2s,  v7.2s,  v22.2s
        fadd            v5.2s,  v5.2s,  v23.2s
        fsub            v4.2s,  v4.2s,  v24.2s
        fsub            v6.2s,  v6.2s,  v25.2s
        b.eq            4f
        // Pipeline the next iteration's loads ahead of this one's stores.
        ld2             {v0.2s,v1.2s},   [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7       // d16=c1,c0 d18=s1,s0
        rev64           v5.2s,  v5.2s           // reverse the downward-written halves
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},   [x0], x7
        st2             {v6.2s,v7.2s},   [x8], #16
        b               3b
4:      // Epilogue: store the last pair of result vectors.
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},   [x0]
        st2             {v6.2s,v7.2s},   [x8]

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        AARCH64_VALIDATE_LINK_REGISTER
        add             sp,  sp,  #32

        ret
endfunc

/*
 * ff_imdct_calc_neon(ctx, output, input)
 *
 * Full inverse MDCT: computes the half transform into the upper part
 * of the output buffer via ff_imdct_half_neon, then mirrors it into
 * the full-length result (negating one reflected half, reversing the
 * other) to satisfy the IMDCT output symmetry.
 * In:  x0 = context, x1 = output, x2 = input (all forwarded to the
 *      half transform). x19 = n (in bytes after lsl), x20 = output
 *      base, preserved across the call.
 */
function ff_imdct_calc_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #16]
        ldr             w3,  [x0, #28]          // mdct_bits
        mov             x19, #1
        mov             x20, x1                 // remember output base
        lsl             x19, x19, x3            // n = 1 << mdct_bits
        add             x1,  x1,  x19           // half-transform output at centre

        bl              X(ff_imdct_half_neon)

        add             x0,  x20, x19, lsl #2   // descending store ptr (reversed half)
        add             x1,  x20, x19, lsl #1   // ascending  read ptr
        sub             x0,  x0,  #8
        sub             x2,  x1,  #16           // descending read ptr
        mov             x3,  #-16
        mov             x6,  #-8
1:
        ld1             {v0.4s},        [x2], x3
        prfum           pldl1keep, [x0, #-16]   // prefetch next store target
        rev64           v0.4s,  v0.4s
        ld1             {v2.2s,v3.2s},  [x1], #16
        fneg            v4.4s,  v0.4s           // negated, reversed copy
        prfum           pldl1keep, [x2, #-16]
        rev64           v2.2s,  v2.2s
        rev64           v3.2s,  v3.2s
        ext             v4.16b, v4.16b, v4.16b, #8 // swap halves to finish the reversal
        st1             {v2.2s},        [x0], x6
        st1             {v3.2s},        [x0], x6
        st1             {v4.4s},        [x20], #16
        subs            x19, x19, #16           // 16 bytes (4 floats) per iteration
        b.gt            1b

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        AARCH64_VALIDATE_LINK_REGISTER
        add             sp,  sp,  #32

        ret
endfunc


/*
 * ff_mdct_calc_neon(ctx, output, input)
 *
 * Forward MDCT. Structure mirrors ff_imdct_half_neon:
 *   1. fold the four input quarters into n/4 complex values and
 *      pre-rotate by the twiddles, scattering to bit-reversed
 *      positions (revtab read backwards from both ends via x3/x13);
 *   2. complex FFT via ff_fft_calc_neon;
 *   3. post-rotate, with extra fneg's relative to the inverse
 *      transform for the forward sign convention.
 * In:  x0 = context (revtab at #8, mdct_bits at #28, tcos at #32),
 *      x1 = output, x2 = input.
 * NOTE(review): x12 is used both as the -16 load stride and as ubfm
 * scratch inside the loop, hence the reload of #-16 after the branch.
 */
function ff_mdct_calc_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #16]

        mov             x12, #1
        ldr             w14, [x0, #28]          // mdct_bits
        ldr             x4,  [x0, #32]          // tcos
        ldr             x3,  [x0, #8]           // revtab
        lsl             x14, x12, x14           // n = 1 << nbits
        add             x7,  x2,  x14           // in4u
        sub             x9,  x7,  #16           // in4d
        add             x2,  x7,  x14, lsl #1   // in3u
        add             x8,  x9,  x14, lsl #1   // in3d
        add             x5,  x4,  x14, lsl #1   // tcos walked down from the far end
        sub             x5,  x5,  #16
        sub             x3,  x3,  #4            // pre-decrement: ldr w6,[x3,#4]! below
        mov             x12, #-16
        lsr             x13, x14, #1            // offset for the second revtab stream

        ld2             {v16.2s,v17.2s}, [x9], x12      // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12      // in2u0,in2u1 in3d1,in3d0
        ld2             {v0.2s, v1.2s},  [x7], #16      // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s          // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s          // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16      // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s,  v17.2s, v0.2s   // in4d-in4u  I
        ld2             {v20.2s,v21.2s}, [x4], #16      // c0,c1 s0,s1
        rev64           v1.2s,  v1.2s           // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s,  v3.2s           // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12      // c2,c3 s2,s3
        fadd            v2.2s,  v2.2s,  v19.2s  // in3u+in3d -R
        fsub            v16.2s, v16.2s, v1.2s   // in0u-in2d  R
        fadd            v18.2s, v18.2s, v3.2s   // in2u+in1d -I
1:
        fmul            v7.2s,  v0.2s,  v21.2s  //  I*s
        ldr             w10, [x3, x13]          // revtab word, second stream
        fmul            v6.2s,  v2.2s,  v20.2s  // -R*c
        ldr             w6,  [x3, #4]!          // revtab word, first stream
        fmul            v4.2s,  v2.2s,  v21.2s  // -R*s
        fmul            v5.2s,  v0.2s,  v20.2s  //  I*c
        fmul            v24.2s, v16.2s, v30.2s  //  R*c
        fmul            v25.2s, v18.2s, v31.2s  // -I*s
        fmul            v22.2s, v16.2s, v31.2s  //  R*s
        fmul            v23.2s, v18.2s, v30.2s  //  I*c
        subs            x14, x14, #16
        subs            x13, x13, #8
        fsub            v6.2s,  v6.2s,  v7.2s   // -R*c-I*s
        fadd            v7.2s,  v4.2s,  v5.2s   // -R*s+I*c
        fsub            v24.2s, v25.2s, v24.2s  //  I*s-R*c
        fadd            v25.2s, v22.2s, v23.2s  //  R*s-I*c
        b.eq            1f
        mov             x12, #-16               // restore stride (x12 is ubfm scratch below)
        ld2             {v16.2s,v17.2s}, [x9], x12      // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12      // in2u0,in2u1 in3d1,in3d0
        fneg            v7.2s,  v7.2s           //  R*s-I*c
        ld2             {v0.2s, v1.2s},  [x7], #16      // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s          // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s          // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16      // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s,  v17.2s, v0.2s   // in4d-in4u  I
        ld2             {v20.2s,v21.2s}, [x4], #16      // c0,c1 s0,s1
        rev64           v1.2s,  v1.2s           // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s,  v3.2s           // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12      // c2,c3 s2,s3
        fadd            v2.2s,  v2.2s,  v19.2s  // in3u+in3d -R
        fsub            v16.2s, v16.2s, v1.2s   // in0u-in2d  R
        fadd            v18.2s, v18.2s, v3.2s   // in2u+in1d -I
        // Scatter both result pairs to bit-reversed positions.
        ubfm            x12, x6,  #16, #31
        ubfm            x6,  x6,  #0,  #15
        add             x12, x1,  x12, lsl #3
        add             x6,  x1,  x6,  lsl #3
        st2             {v6.s,v7.s}[0],   [x6]
        st2             {v6.s,v7.s}[1],   [x12]
        ubfm            x6,  x10, #16, #31
        ubfm            x10, x10, #0,  #15
        add             x6 , x1,  x6,  lsl #3
        add             x10, x1,  x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]
        b               1b
1:      // Epilogue: negate and scatter the final pairs.
        fneg            v7.2s,  v7.2s           //  R*s-I*c
        ubfm            x12, x6,  #16, #31
        ubfm            x6,  x6,  #0,  #15
        add             x12, x1,  x12, lsl #3
        add             x6,  x1,  x6,  lsl #3
        st2             {v6.s,v7.s}[0],   [x6]
        st2             {v6.s,v7.s}[1],   [x12]
        ubfm            x6,  x10, #16, #31
        ubfm            x10, x10, #0,  #15
        add             x6 , x1,  x6,  lsl #3
        add             x10, x1,  x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]

        mov             x19, x0                 // keep ctx across the call
        mov             x20, x1                 // keep output across the call
        bl              X(ff_fft_calc_neon)

        // Post-rotation (same walk as ff_imdct_half_neon, plus fneg's).
        mov             x12, #1
        ldr             w14, [x19, #28]         // mdct_bits
        ldr             x4,  [x19, #32]         // tcos
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #3            // n8 = n >> 3

        add             x4,  x4,  x14, lsl #3
        add             x6,  x20, x14, lsl #3
        sub             x1,  x4,  #16
        sub             x3,  x6,  #16

        mov             x7,  #-16
        mov             x8,  x6
        mov             x0,  x3

        ld2             {v0.2s,v1.2s},   [x3], x7       // d0 =r1,i1 d1 =r0,i0
        ld2             {v20.2s,v21.2s}, [x6], #16      // d20=r2,i2 d21=r3,i3
        ld2             {v16.2s,v17.2s}, [x1], x7       // c1,c0 s1,s0
1:
        subs            x14, x14, #2
        fmul            v7.2s,  v0.2s,  v17.2s  // r1*s1,r0*s0
        ld2             {v18.2s,v19.2s}, [x4], #16      // c2,c3 s2,s3
        fmul            v4.2s,  v1.2s,  v17.2s  // i1*s1,i0*s0
        fmul            v6.2s,  v21.2s, v19.2s  // i2*s2,i3*s3
        fmul            v5.2s,  v20.2s, v19.2s  // r2*s2,r3*s3
        fmul            v24.2s, v0.2s,  v16.2s  // r1*c1,r0*c0
        fmul            v25.2s, v20.2s, v18.2s  // r2*c2,r3*c3
        fmul            v22.2s, v21.2s, v18.2s  // i2*c2,i3*c3
        fmul            v23.2s, v1.2s,  v16.2s  // i1*c1,i0*c0
        fadd            v4.2s,  v4.2s,  v24.2s  // i1*s1+r1*c1,i0*s0+r0*c0
        fadd            v6.2s,  v6.2s,  v25.2s  // i2*s2+r2*c2,i3*s3+r3*c3
        fsub            v5.2s,  v22.2s, v5.2s   // i2*c2-r2*s2,i3*c3-r3*s3
        fsub            v7.2s,  v23.2s, v7.2s   // i1*c1-r1*s1,i0*c0-r0*s0
        fneg            v4.2s,  v4.2s           // forward-transform sign flip
        fneg            v6.2s,  v6.2s
        b.eq            1f
        ld2             {v0.2s, v1.2s},  [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7       // c1,c0 s1,s0
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},   [x0], x7
        st2             {v6.2s,v7.2s},   [x8], #16
        b               1b
1:      // Epilogue: store the final post-rotated vectors.
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},   [x0]
        st2             {v6.2s,v7.2s},   [x8]

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        AARCH64_VALIDATE_LINK_REGISTER
        add             sp,  sp,  #32

        ret
endfunc