/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/aarch64/asm.S"

/*
 * swri_oldapi_conv_flt_to_s16_neon
 *
 * Converts a contiguous run of 32-bit floats to signed 16-bit integers.
 *
 *   x0   output pointer, written 8 x int16 (16 bytes) per store
 *   x1   input pointer, read 4 x float (16 bytes) per load
 *   w2   number of samples to convert
 *
 * Every float goes through FCVTZS #31 (truncating, saturating conversion to
 * a 32-bit value with 31 fractional bits) and is then narrowed to 16 bits
 * with SQRSHRN/SQRSHRN2 #16 (rounding, saturating shift right).
 *
 * The count is handled as a 32-bit quantity: AAPCS64 leaves bits 63:32 of a
 * register carrying a 32-bit argument unspecified, and the n-channel entry
 * point in this file, which tail-calls this routine, only ever reads w2.
 * NOTE(review): assumes the C prototype declares the count as int - confirm
 * against the declaring header.
 *
 * No argument checks are made.  The first 8 samples are loaded before the
 * count is examined and all work is done in units of 8, so the count is
 * presumably required to be a non-zero multiple of 8 - confirm with callers.
 *
 * Clobbers: x0, x1, x2, x12, v0-v7, NZCV.
 *
 * The vector loads, stores and conversions between a SUBS/ANDS and its
 * conditional branch do not write NZCV, which is what allows the flag
 * setting instruction and the branch to sit so far apart.
 */
function swri_oldapi_conv_flt_to_s16_neon, export=1
oldapi_conv_flt_to_s16_neon:
        subs            w2,  w2,  #8                    // w2 = samples left after the 8 preloaded below
        ld1             {v0.4s},  [x1], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x1], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        b.eq            3f                              // exactly 8 samples: just flush v4/v5
        ands            w12, w2,  #~15                  // w12 = remainder rounded down to a multiple of 16
        b.eq            2f                              // fewer than 16 left: single 16-sample tail
        // Main loop: v4/v5 enter holding 8 converted samples that are still
        // to be narrowed and stored; each pass stores 16 and converts 16 more.
1:      subs            w12, w12, #16
        sqrshrn         v4.4h,  v4.4s,  #16
        ld1             {v2.4s},  [x1], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        sqrshrn2        v4.8h,  v5.4s,  #16
        ld1             {v3.4s},  [x1], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h},  [x0], #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        ld1             {v0.4s},  [x1], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x1], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        st1             {v6.8h},  [x0], #16
        b.ne            1b
        ands            w2,  w2,  #15                   // anything beyond the 8 pending in v4/v5?
        b.eq            3f
        // Tail for 16 samples: the 8 pending in v4/v5 plus 8 still in memory.
2:      ld1             {v2.4s},  [x1], #16
        sqrshrn         v4.4h,  v4.4s,  #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x1], #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h},  [x0], #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        st1             {v6.8h},  [x0]
        ret
        // Tail for the 8 samples pending in v4/v5.
3:      sqrshrn         v4.4h,  v4.4s,  #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        st1             {v4.8h},  [x0]
        ret
endfunc

/*
 * swri_oldapi_conv_fltp_to_s16_2ch_neon
 *
 * Converts two separate float planes into one interleaved 16-bit stream.
 *
 *   x0   output pointer, written 4 x 32-bit lanes (4 two-channel frames) per
 *        vector
 *   x1   pointer to two consecutive plane pointers (fetched with LDP)
 *   w2   number of samples per plane
 *
 * Every float goes through FCVTZS #31; SRI #16 then drops the upper 16 bits
 * of the first plane's value into the lower half of the second plane's lane,
 * so each 32-bit lane carries the first plane in bits 15:0 and the second
 * plane in bits 31:16.  Unlike the SQRSHRN path of the contiguous routine
 * above, the lower 16 bits are simply discarded (no rounding step).
 *
 * The count is handled as a 32-bit quantity for the same reason as in
 * swri_oldapi_conv_flt_to_s16_neon: bits 63:32 of x2 are unspecified for a
 * 32-bit argument, and the n-channel entry point that tail-calls this
 * routine only reads w2.
 * NOTE(review): assumes the C prototype declares the count as int - confirm.
 *
 * No argument checks are made; 8 samples per plane are loaded before the
 * count is examined, so the count is presumably a non-zero multiple of 8 -
 * confirm with callers.
 *
 * Clobbers: x0, x2, x4, x5, x12, v0-v7, v16-v23, NZCV.
 */
function swri_oldapi_conv_fltp_to_s16_2ch_neon, export=1
oldapi_conv_fltp_to_s16_2ch_neon:
        ldp             x4,  x5,  [x1]                  // x4 = first plane, x5 = second plane
        subs            w2,  w2,  #8                    // w2 = samples left after the 8 preloaded below
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        b.eq            3f                              // exactly 8 samples: just flush v4-v7
        ands            w12, w2,  #~15                  // w12 = remainder rounded down to a multiple of 16
        b.eq            2f                              // fewer than 16 left: single 16-sample tail
        // Main loop: v4/v5 (first plane) and v6/v7 (second plane) enter
        // holding 8 converted samples each; every pass writes 16 frames and
        // converts the next 16 samples of both planes.
1:      subs            w12, w12, #16
        ld1             {v16.4s}, [x4], #16
        fcvtzs          v20.4s, v16.4s, #31
        sri             v6.4s,  v4.4s,  #16
        ld1             {v17.4s}, [x4], #16
        fcvtzs          v21.4s, v17.4s, #31
        ld1             {v18.4s}, [x5], #16
        fcvtzs          v22.4s, v18.4s, #31
        ld1             {v19.4s}, [x5], #16
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s},  [x0], #16
        fcvtzs          v23.4s, v19.4s, #31
        st1             {v7.4s},  [x0], #16
        sri             v22.4s, v20.4s, #16
        ld1             {v0.4s},  [x4], #16
        sri             v23.4s, v21.4s, #16
        st1             {v22.4s}, [x0], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        st1             {v23.4s}, [x0], #16
        b.ne            1b
        ands            w2,  w2,  #15                   // anything beyond the 8 pending in v4-v7?
        b.eq            3f
        // Tail for 16 samples: the 8 pending in v4-v7 plus 8 still in memory.
2:      sri             v6.4s,  v4.4s,  #16
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        sri             v7.4s,  v5.4s,  #16
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        sri             v2.4s,  v0.4s,  #16
        st1             {v6.4s,v7.4s}, [x0], #32
        sri             v3.4s,  v1.4s,  #16
        st1             {v2.4s,v3.4s}, [x0], #32
        ret
        // Tail for the 8 samples pending in v4-v7.
3:      sri             v6.4s,  v4.4s,  #16
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s,v7.4s}, [x0]
        ret
endfunc

/*
 * swri_oldapi_conv_fltp_to_s16_nch_neon
 *
 * Converts an arbitrary number of float planes into one interleaved 16-bit
 * stream.
 *
 *   x0   output pointer
 *   x1   pointer to an array of plane pointers, one per channel
 *   w2   number of samples per plane
 *   w3   number of channels
 *
 * Dispatch:
 *   - w3 == 2 tail-calls the dedicated two-channel routine;
 *   - w3 <  2 (signed compare) loads the first plane pointer into x1 and
 *     tail-calls the contiguous routine;
 *   - otherwise channels are consumed four at a time while at least four
 *     remain, then two, then one.  Each group makes a full pass over the
 *     samples, writing its columns of the output with a stride of one
 *     interleaved frame (x12 = 2 * channels bytes), after which x0 moves on
 *     to the next group's first column.
 *
 * In the strided paths every float goes through FCVTZS #31 and only the
 * upper 16 bits of the result are kept (SRI #16 merge or odd halfword lane
 * store); there is no rounding step.
 *
 * Both w2 and w3 are handled as 32-bit quantities.  In particular the stride
 * is derived from w3 rather than x3, because AAPCS64 leaves bits 63:32 of a
 * register carrying a 32-bit argument unspecified and every other use of the
 * channel count in this routine already goes through w3.
 * NOTE(review): assumes both are declared int in the C prototype - confirm.
 *
 * No argument checks are made.  The four-channel pass works in units of 8
 * samples and the two-/one-channel passes test bit 3 of the count and then
 * work in units of 16, so the count is presumably a non-zero multiple of 8 -
 * confirm with callers.
 *
 * Clobbers: x0, x1, x3-x9, x12, v0-v7, v16-v19, NZCV (plus whatever the
 * tail-called routines clobber).
 */
function swri_oldapi_conv_fltp_to_s16_nch_neon, export=1
        cmp             w3,  #2
        b.eq            oldapi_conv_fltp_to_s16_2ch_neon
        b.gt            1f
        ldr             x1,  [x1]                       // single plane: hand it to the contiguous routine
        b               oldapi_conv_flt_to_s16_neon
1:
        cmp             w3,  #4
        lsl             w12, w3,  #1                    // x12 = output stride in bytes; the w write clears bits 63:32
        b.lt            4f

5:      // 4 channels
        ldp             x4,  x5,  [x1], #16             // next four plane pointers
        ldp             x6,  x7,  [x1], #16
        mov             w9,  w2                         // w9 = samples left in this pass
        mov             x8,  x0                         // x8 = write cursor for this channel group
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s},  [x6], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s},  [x7], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        // 8 samples per pass.  SRI pairs up neighbouring channels, ZIP1/ZIP2
        // then put the two pairs side by side so that each 64-bit half holds
        // the four channels of one sample.
6:
        subs            w9,  w9,  #8
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        sri             v5.4s,  v4.4s,  #16
        ld1             {v1.4s},  [x5], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        sri             v7.4s,  v6.4s,  #16
        ld1             {v2.4s},  [x6], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        zip1            v16.4s, v5.4s,  v7.4s
        ld1             {v3.4s},  [x7], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        zip2            v17.4s, v5.4s,  v7.4s
        st1             {v16.d}[0], [x8], x12
        sri             v1.4s,  v0.4s,  #16
        st1             {v16.d}[1], [x8], x12
        sri             v3.4s,  v2.4s,  #16
        st1             {v17.d}[0], [x8], x12
        zip1            v18.4s, v1.4s,  v3.4s
        st1             {v17.d}[1], [x8], x12
        zip2            v19.4s, v1.4s,  v3.4s
        b.eq            7f                              // last 8 samples: skip the preload
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v18.d}[0], [x8], x12
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v18.d}[1], [x8], x12
        ld1             {v6.4s},  [x6], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v19.d}[0], [x8], x12
        ld1             {v7.4s},  [x7], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v19.d}[1], [x8], x12
        b               6b
7:
        st1             {v18.d}[0], [x8], x12
        st1             {v18.d}[1], [x8], x12
        st1             {v19.d}[0], [x8], x12
        st1             {v19.d}[1], [x8], x12
        subs            w3,  w3,  #4                    // four channels done
        b.eq            9f
        cmp             w3,  #4
        add             x0,  x0,  #8                    // step past the four int16 columns just written
        b.ge            5b

4:      // 2 channels
        cmp             w3,  #2
        b.lt            4f
        ldp             x4,  x5,  [x1], #16             // next two plane pointers
        mov             w9,  w2                         // w9 = samples left in this pass
        mov             x8,  x0                         // x8 = write cursor for this channel pair
        tst             w9,  #8                         // odd multiple of 8? (consumed by b.eq below)
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s},  [x4], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s},  [x5], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        b.eq            6f                              // bit 3 clear: straight to the 16-sample loop
        subs            w9,  w9,  #8
        b.eq            7f                              // exactly 8 samples
        // Leading 8 samples; brings the remainder to a multiple of 16 and
        // reloads v4-v7 for the loop below.
        sri             v5.4s,  v4.4s,  #16
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v5.s}[0],  [x8], x12
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[1],  [x8], x12
        ld1             {v6.4s},  [x4], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        st1             {v7.s}[0],  [x8], x12
        st1             {v7.s}[1],  [x8], x12
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v7.s}[2],  [x8], x12
        st1             {v7.s}[3],  [x8], x12
        ld1             {v7.4s},  [x5], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        // 16 samples per pass; each 32-bit lane store writes one two-channel
        // frame.
6:
        subs            w9,  w9,  #16
        ld1             {v0.4s},  [x4], #16
        sri             v5.4s,  v4.4s,  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x5], #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0],  [x8], x12
        st1             {v5.s}[1],  [x8], x12
        fcvtzs          v1.4s,  v1.4s,  #31
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        ld1             {v2.4s},  [x4], #16
        st1             {v7.s}[0],  [x8], x12
        fcvtzs          v2.4s,  v2.4s,  #31
        st1             {v7.s}[1],  [x8], x12
        ld1             {v3.4s},  [x5], #16
        st1             {v7.s}[2],  [x8], x12
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v7.s}[3],  [x8], x12
        sri             v1.4s,  v0.4s,  #16
        sri             v3.4s,  v2.4s,  #16
        b.eq            6f                              // last 16 samples: skip the preload
        ld1             {v4.4s},  [x4], #16
        st1             {v1.s}[0],  [x8], x12
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v1.s}[1],  [x8], x12
        ld1             {v5.4s},  [x5], #16
        st1             {v1.s}[2],  [x8], x12
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v1.s}[3],  [x8], x12
        ld1             {v6.4s},  [x4], #16
        st1             {v3.s}[0],  [x8], x12
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v3.s}[1],  [x8], x12
        ld1             {v7.4s},  [x5], #16
        st1             {v3.s}[2],  [x8], x12
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v3.s}[3],  [x8], x12
        b.gt            6b                              // still using the flags of the subs at the loop head
6:
        st1             {v1.s}[0],  [x8], x12
        st1             {v1.s}[1],  [x8], x12
        st1             {v1.s}[2],  [x8], x12
        st1             {v1.s}[3],  [x8], x12
        st1             {v3.s}[0],  [x8], x12
        st1             {v3.s}[1],  [x8], x12
        st1             {v3.s}[2],  [x8], x12
        st1             {v3.s}[3],  [x8], x12
        b               8f
7:
        sri             v5.4s,  v4.4s,  #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0],  [x8], x12
        st1             {v5.s}[1],  [x8], x12
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        st1             {v7.s}[0],  [x8], x12
        st1             {v7.s}[1],  [x8], x12
        st1             {v7.s}[2],  [x8], x12
        st1             {v7.s}[3],  [x8], x12
8:
        subs            w3,  w3,  #2                    // two channels done
        add             x0,  x0,  #4                    // step past the two int16 columns just written
        b.eq            9f

4:      // 1 channel
        ldr             x4,  [x1]                       // last plane pointer
        tst             w2,  #8                         // odd multiple of 8? (consumed by b.ne below)
        mov             w9,  w2                         // w9 = samples left in this pass
        mov             x5,  x0                         // x5 = write cursor for this channel
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b.ne            8f                              // bit 3 set: peel 8 samples first
        // 16 samples per pass; the odd halfword lanes are the upper 16 bits
        // of each converted 32-bit value.
6:
        subs            w9,  w9,  #16
        ld1             {v2.4s},  [x4], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x4], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v0.h}[1],  [x5], x12
        st1             {v0.h}[3],  [x5], x12
        st1             {v0.h}[5],  [x5], x12
        st1             {v0.h}[7],  [x5], x12
        st1             {v1.h}[1],  [x5], x12
        st1             {v1.h}[3],  [x5], x12
        st1             {v1.h}[5],  [x5], x12
        st1             {v1.h}[7],  [x5], x12
        b.eq            7f                              // last 16 samples: skip the preload
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
7:
        st1             {v2.h}[1],  [x5], x12
        st1             {v2.h}[3],  [x5], x12
        st1             {v2.h}[5],  [x5], x12
        st1             {v2.h}[7],  [x5], x12
        st1             {v3.h}[1],  [x5], x12
        st1             {v3.h}[3],  [x5], x12
        st1             {v3.h}[5],  [x5], x12
        st1             {v3.h}[7],  [x5], x12
        b.gt            6b                              // still using the flags of the subs at the loop head
        ret
        // Leading 8 samples when the count is an odd multiple of 8.
8:
        subs            w9,  w9,  #8
        st1             {v0.h}[1],  [x5], x12
        st1             {v0.h}[3],  [x5], x12
        st1             {v0.h}[5],  [x5], x12
        st1             {v0.h}[7],  [x5], x12
        st1             {v1.h}[1],  [x5], x12
        st1             {v1.h}[3],  [x5], x12
        st1             {v1.h}[5],  [x5], x12
        st1             {v1.h}[7],  [x5], x12
        b.eq            9f
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b               6b
9:      // shared exit for all channel-group passes
        ret
endfunc