/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

/*
 * Syntax: GNU as, AArch64, preprocessed by the C preprocessor.
 * `//` starts a comment that runs to the end of the physical line, so every
 * instruction must sit on its own line.
 *
 * The helper macros used below (function/endfunc, const/endconst, movrel,
 * AARCH64_VALID_CALL_TARGET, transpose_4x4H, transpose_8x8H) are not defined
 * in this file; presumably they come from the two headers included above.
 *
 * Terminology in the comments: "block" is an array of signed 16-bit values,
 * "dest" is an array of unsigned bytes addressed row by row with a stride.
 */

/*
 * ff_h264_idct_add_neon
 *
 * Two-pass 4x4 butterfly transform of the sixteen 16-bit values at [x1]
 * (the H.264 4x4 inverse transform, going by the function name). The result
 * is rounding-shifted right by 6, added to a 4x4 byte block at [x0] and
 * written back with unsigned saturation.
 *
 * In:   x0 = dest, four rows of 4 bytes, consecutive rows x2 bytes apart
 *       x1 = block, sixteen 16-bit values (32 bytes)
 *       w2 = dest stride; sign-extended into x2 here
 * Out:  the 32 bytes at [x1] are zeroed and x1 has its entry value again
 *       x0 = entry x0 + 4 * stride
 * Uses: v0-v7, v16-v19, v30 (v4-v7 are also handed to transpose_4x4H).
 *
 * The .L_ label is the entry used by the dispatchers further down, which
 * reach this code with blr.
 */
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
        AARCH64_VALID_CALL_TARGET
        ld1             {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]     // four groups of 4 values
        sxtw            x2,  w2
        movi            v30.8H, #0

        // First pass. The two zeroing stores of the block are interleaved
        // with the arithmetic; each clears 16 bytes and advances x1.
        add             v4.4H,  v0.4H,  v2.4H           // v0 + v2
        sshr            v16.4H, v1.4H,  #1
        st1             {v30.8H}, [x1], #16
        sshr            v17.4H, v3.4H,  #1
        st1             {v30.8H}, [x1], #16
        sub             v5.4H,  v0.4H,  v2.4H           // v0 - v2
        sub             v6.4H,  v16.4H, v3.4H           // (v1 >> 1) - v3
        add             v7.4H,  v1.4H,  v17.4H          // v1 + (v3 >> 1)
        add             v0.4H,  v4.4H,  v7.4H
        add             v1.4H,  v5.4H,  v6.4H
        sub             v2.4H,  v5.4H,  v6.4H
        sub             v3.4H,  v4.4H,  v7.4H

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // Second pass, interleaved with loading the four dest rows.
        // Rows 0,1 go to v18.S[0],[1]; rows 2,3 go to v19.S[1],[0]. That
        // swapped order matches v1 below, whose low half is (v4 - v6) and
        // whose high half is (v5 - v7).
        add             v4.4H,  v0.4H,  v2.4H           // v0 + v2
        ld1             {v18.S}[0], [x0], x2
        sshr            v16.4H, v3.4H,  #1
        sshr            v17.4H, v1.4H,  #1
        ld1             {v18.S}[1], [x0], x2
        sub             v5.4H,  v0.4H,  v2.4H           // v0 - v2
        ld1             {v19.S}[1], [x0], x2
        add             v6.4H,  v16.4H, v1.4H           // (v3 >> 1) + v1
        ins             v4.D[1], v5.D[0]                // v4 = { v4, v5 }
        sub             v7.4H,  v17.4H, v3.4H           // (v1 >> 1) - v3
        ld1             {v19.S}[0], [x0], x2
        ins             v6.D[1], v7.D[0]                // v6 = { v6, v7 }
        sub             x0,  x0,  x2,  lsl #2           // rewind dest by four rows
        add             v0.8H,  v4.8H,  v6.8H
        sub             v1.8H,  v4.8H,  v6.8H

        srshr           v0.8H,  v0.8H,  #6              // rounding shift right by 6
        srshr           v1.8H,  v1.8H,  #6

        uaddw           v0.8H,  v0.8H,  v18.8B          // add the dest bytes
        uaddw           v1.8H,  v1.8H,  v19.8B

        sqxtun          v0.8B,  v0.8H                   // saturate to unsigned bytes
        sqxtun          v1.8B,  v1.8H

        // Store in the same row order that was used for the loads.
        st1             {v0.S}[0], [x0], x2
        st1             {v0.S}[1], [x0], x2
        st1             {v1.S}[1], [x0], x2
        st1             {v1.S}[0], [x0], x2

        sub             x1,  x1,  #32                   // undo the two post-increments
        ret
endfunc

/*
 * ff_h264_idct_dc_add_neon
 *
 * Adds one rounded value to every byte of a 4x4 block: the first 16-bit
 * value at [x1] is replicated, rounding-shifted right by 6, added to the
 * four 4-byte rows at [x0] and stored back with unsigned saturation.
 *
 * In:   x0 = dest, four rows of 4 bytes, x2 bytes apart
 *       x1 = block; only the first 16-bit value is read
 *       w2 = dest stride; sign-extended into x2 here
 * Out:  the 16-bit value at [x1] is set to 0; x1 is not modified
 *       x0 = entry x0 + 4 * stride
 * Uses: w3, v0-v4.
 */
function ff_h264_idct_dc_add_neon, export=1
.L_ff_h264_idct_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        sxtw            x2,  w2
        mov             w3,  #0
        ld1r            {v2.8H}, [x1]                   // replicate block[0] to all lanes
        strh            w3,  [x1]                       // block[0] = 0
        srshr           v2.8H,  v2.8H,  #6
        ld1             {v0.S}[0], [x0], x2
        ld1             {v0.S}[1], [x0], x2
        uaddw           v3.8H,  v2.8H,  v0.8B           // rows 0 and 1
        ld1             {v1.S}[0], [x0], x2
        ld1             {v1.S}[1], [x0], x2
        uaddw           v4.8H,  v2.8H,  v1.8B           // rows 2 and 3
        sqxtun          v0.8B,  v3.8H
        sqxtun          v1.8B,  v4.8H
        sub             x0,  x0,  x2,  lsl #2           // rewind dest by four rows
        st1             {v0.S}[0], [x0], x2
        st1             {v0.S}[1], [x0], x2
        st1             {v1.S}[0], [x0], x2
        st1             {v1.S}[1], [x0], x2
        ret
endfunc

/*
 * ff_h264_idct_add16_neon
 *
 * Walks 16 consecutive 4x4 blocks (32 bytes of coefficients each) and, per
 * block i, looks up count = x4[scan8[i]]:
 *   count == 0                   -> nothing is done for this block
 *   count == 1 and block[0] != 0 -> ff_h264_idct_dc_add_neon
 *   anything else                -> ff_h264_idct_add_neon
 *
 * In:   x0 = dest base
 *       x1 = block_offset, 32-bit signed offsets added to dest
 *       x2 = block, first of the 16 coefficient blocks
 *       w3 = stride, passed on to the callee in w2
 *       x4 = byte table indexed by scan8[] entries (the add8 routine below
 *            labels the same lookup "nnzc")
 * Uses: x0-x3, x5-x7, x9, x10, x12-x15 plus whatever the callees use.
 *       The return address is parked in x12 because blr overwrites x30.
 */
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30
        mov             x6,  x0                         // dest
        mov             x5,  x1                         // block_offset
        mov             x1,  x2                         // block
        mov             w9,  w3                         // stride
        movrel          x7,  scan8
        mov             x10, #16                        // blocks left
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9                         // callee argument: stride
        ldrb            w3,  [x7], #1                   // scan8[i]
        ldrsw           x0,  [x5], #4                   // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]           // count = x4[scan8[i]]
        subs            w3,  w3,  #1                    // flags: lt if count == 0, eq if count == 1
        b.lt            2f
        ldrsh           w3,  [x1]                       // block[0]; loads leave the flags alone
        add             x0,  x0,  x6                    // callee argument: dest + offset
        // count == 1: compare block[0] with 0. Otherwise force Z=1 (nzcv #4),
        // so "ne" is true only for count == 1 with a non-zero block[0].
        ccmp            w3,  #0,  #4,  eq
        csel            x15, x13, x14, ne               // ne -> dc add, else full transform
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32                   // next block; add does not touch the flags
        b.ne            1b
        ret             x12
endfunc

/*
 * ff_h264_idct_add16intra_neon
 *
 * Same walk over 16 blocks as ff_h264_idct_add16_neon, with a different
 * selection rule per block i, where count = x4[scan8[i]]:
 *   count != 0                   -> ff_h264_idct_add_neon
 *   count == 0 and block[0] != 0 -> ff_h264_idct_dc_add_neon
 *   count == 0 and block[0] == 0 -> nothing is done for this block
 *
 * In:   x0 = dest base, x1 = block_offset, x2 = block, w3 = stride,
 *       x4 = byte table indexed by scan8[] entries
 * Uses: x0-x3, x5-x7, x9, x10, x12-x15 plus whatever the callees use.
 *       The return address is parked in x12 because blr overwrites x30.
 */
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30
        mov             x6,  x0                         // dest
        mov             x5,  x1                         // block_offset
        mov             x1,  x2                         // block
        mov             w9,  w3                         // stride
        movrel          x7,  scan8
        mov             x10, #16                        // blocks left
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9                         // callee argument: stride
        ldrb            w3,  [x7], #1                   // scan8[i]
        ldrsw           x0,  [x5], #4                   // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]           // count = x4[scan8[i]]
        add             x0,  x0,  x6                    // callee argument: dest + offset
        cmp             w3,  #0
        ldrsh           w3,  [x1]                       // block[0]; flags still hold count == 0
        csel            x15, x13, x14, eq               // count == 0 -> dc add, else full transform
        // count == 0: compare block[0] with 0. Otherwise clear all flags
        // (nzcv #0), so "eq" is true only when count and block[0] are both 0.
        ccmp            w3,  #0,  #0,  eq
        b.eq            2f
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32                   // next block; add does not touch the flags
        b.ne            1b
        ret             x12
endfunc

/*
 * ff_h264_idct_add8_neon
 *
 * Handles two groups of four 4x4 blocks. The loop index i runs over 0..3
 * and then 16..19, and every table access is biased by 16 entries, so the
 * entries actually touched are 16..19 and 32..35 of scan8[], block_offset[]
 * and the coefficient blocks. The first group is added to dest[0], the
 * second to dest[1]. The selection rule per block is the same as in
 * ff_h264_idct_add16intra_neon.
 *
 * In:   x0 = pointer to two dest pointers (dest[0], dest[1])
 *       x1 = block_offset, x2 = block, w3 = stride,
 *       x4 = byte table indexed by scan8[] entries ("nnzc")
 * Stack: reserves 0x40 bytes and stores x19/x20 in the lowest 16 of them;
 *        both registers are reloaded before returning.
 * Uses: x0-x3, x5-x7, x9-x15, x19, x20 plus whatever the callees use.
 *       The return address is parked in x12 because blr overwrites x30.
 */
function ff_h264_idct_add8_neon, export=1
        sub             sp,  sp,  #0x40
        stp             x19, x20, [sp]
        mov             x12, x30
        ldp             x6,  x15, [x0]                  // dest[0], dest[1]
        add             x5,  x1,  #16*4                 // block_offset
        add             x9,  x2,  #16*32                // block
        mov             w19, w3                         // stride
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
        movrel          x7,  scan8, 16
        mov             x10, #0
        mov             x11, #16
1:      mov             w2,  w19
        ldrb            w3,  [x7, x10]                  // scan8[i]
        ldrsw           x0,  [x5, x10, lsl #2]          // block_offset[i]
        ldrb            w3,  [x4, w3,  uxtw]            // nnzc[ scan8[i] ]
        add             x0,  x0,  x6                    // block_offset[i] + dst[j-1]
        add             x1,  x9,  x10, lsl #5           // block + i * 16
        cmp             w3,  #0
        ldrsh           w3,  [x1]                       // block[i*16]
        csel            x20, x13, x14, eq               // nnzc == 0 -> dc add, else full transform
        ccmp            w3,  #0,  #0,  eq               // eq only if nnzc == 0 and block[i*16] == 0
        b.eq            2f
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4
        csel            x10, x11, x10, eq               // mov x10, #16
        csel            x6,  x15, x6,  eq               // switch to dest[1] at the same point
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp]
        add             sp,  sp,  #0x40
        ret             x12
endfunc

/*
 * idct8x8_cols pass
 *
 * One 1-D pass of the 8x8 transform over eight rows of eight 16-bit lanes.
 *
 * pass == 0:
 *   In:  rows 0-5 in v24-v29; rows 6 and 7 are loaded here from [x1] into
 *        v30/v31. v19 must be all zero on entry: it is stored over those
 *        32 bytes (advancing x1 by 32) before being reused as a temporary.
 *   Out: rows 0-5 in v24-v29, row 6 in v18, row 7 in v31.
 * pass != 0:
 *   In:  rows 0-5 in v24-v29, row 6 in v18, row 7 in v31.
 *   Out: rows 0-7 in v24-v31.
 *
 * va/vb are temporary register aliases whose mapping depends on the pass;
 * they are released again at the end of the macro.
 * Temporaries: v16, v17, v19, plus v30 (pass 0) or v18 (other passes).
 */
.macro  idct8x8_cols    pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8H, v26.8H, #1
        add             v16.8H, v24.8H, v28.8H
        ld1             {v30.8H, v31.8H}, [x1]          // rows 6 and 7
        st1             {v19.8H}, [x1], #16             // zero them in memory
        st1             {v19.8H}, [x1], #16
        sub             v17.8H, v24.8H, v28.8H
        sshr            v19.8H, v30.8H, #1
        sub             v18.8H, v18.8H, v30.8H
        add             v19.8H, v19.8H, v26.8H
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8H, v26.8H, #1
        sshr            v19.8H, v18.8H, #1
        add             v16.8H, v24.8H, v28.8H
        sub             v17.8H, v24.8H, v28.8H
        sub             v30.8H, v30.8H, v18.8H
        add             v19.8H, v19.8H, v26.8H
  .endif
        add             v26.8H, v17.8H, va.8H
        sub             v28.8H, v17.8H, va.8H
        add             v24.8H, v16.8H, v19.8H
        sub             vb.8H,  v16.8H, v19.8H
        sub             v16.8H, v29.8H, v27.8H
        add             v17.8H, v31.8H, v25.8H
        sub             va.8H,  v31.8H, v25.8H
        add             v19.8H, v29.8H, v27.8H
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v25.8H, #1
        sshr            v27.8H, v27.8H, #1
        sshr            v29.8H, v29.8H, #1
        sshr            v31.8H, v31.8H, #1
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v16.8H, #2
        sshr            v27.8H, v17.8H, #2
        sshr            v29.8H, va.8H,  #2
        sshr            v31.8H, v19.8H, #2
        sub             v19.8H, v19.8H, v25.8H
        sub             va.8H,  v27.8H, va.8H
        add             v17.8H, v17.8H, v29.8H
        add             v16.8H, v16.8H, v31.8H
  .if \pass == 0
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v18.8H
        sub             v18.8H, v26.8H, v18.8H
        add             v26.8H, v28.8H, v17.8H
        add             v27.8H, v30.8H, v16.8H
        sub             v29.8H, v28.8H, v17.8H
        sub             v28.8H, v30.8H, v16.8H
  .else
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v30.8H
        sub             v30.8H, v26.8H, v30.8H
        add             v26.8H, v28.8H, v17.8H
        sub             v29.8H, v28.8H, v17.8H
        add             v27.8H, v18.8H, v16.8H
        sub             v28.8H, v18.8H, v16.8H
  .endif
        .unreq          va
        .unreq          vb
.endm

/*
 * ff_h264_idct8_add_neon
 *
 * 8x8 counterpart of ff_h264_idct_add_neon: two idct8x8_cols passes with a
 * transpose in between, a rounding shift right by 6, then the result is
 * added to an 8x8 byte block at [x0] and stored back with unsigned
 * saturation.
 *
 * In:   x0 = dest, eight rows of 8 bytes, x2 bytes apart
 *       x1 = block, sixty-four 16-bit values (128 bytes)
 *       w2 = dest stride; sign-extended into x2 here
 * Out:  the 128 bytes at [x1] are zeroed and x1 has its entry value again
 *       x0 = entry x0 + 8 * stride
 * Uses: x3, v0-v7, v16-v19, v24-v31 (v6/v7 are also handed to
 *       transpose_8x8H).
 */
function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
        AARCH64_VALID_CALL_TARGET
        movi            v19.8H, #0
        sxtw            x2,  w2
        // Load rows 0-5 two at a time, zeroing each pair in memory right
        // after it has been read. Rows 6-7 are handled inside pass 0.
        ld1             {v24.8H, v25.8H}, [x1]
        st1             {v19.8H}, [x1], #16
        st1             {v19.8H}, [x1], #16
        ld1             {v26.8H, v27.8H}, [x1]
        st1             {v19.8H}, [x1], #16
        st1             {v19.8H}, [x1], #16
        ld1             {v28.8H, v29.8H}, [x1]
        st1             {v19.8H}, [x1], #16
        st1             {v19.8H}, [x1], #16

        idct8x8_cols    0
        // Pass 0 leaves row 6 in v18, hence v18 in the seventh position.
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        mov             x3,  x0                         // x3 = store pointer, x0 = load pointer
        srshr           v24.8H, v24.8H, #6
        ld1             {v0.8B}, [x0], x2
        srshr           v25.8H, v25.8H, #6
        ld1             {v1.8B}, [x0], x2
        srshr           v26.8H, v26.8H, #6
        ld1             {v2.8B}, [x0], x2
        srshr           v27.8H, v27.8H, #6
        ld1             {v3.8B}, [x0], x2
        srshr           v28.8H, v28.8H, #6
        ld1             {v4.8B}, [x0], x2
        srshr           v29.8H, v29.8H, #6
        ld1             {v5.8B}, [x0], x2
        srshr           v30.8H, v30.8H, #6
        ld1             {v6.8B}, [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v7.8B}, [x0], x2
        uaddw           v24.8H, v24.8H, v0.8B
        uaddw           v25.8H, v25.8H, v1.8B
        uaddw           v26.8H, v26.8H, v2.8B
        sqxtun          v0.8B,  v24.8H
        uaddw           v27.8H, v27.8H, v3.8B
        sqxtun          v1.8B,  v25.8H
        uaddw           v28.8H, v28.8H, v4.8B
        sqxtun          v2.8B,  v26.8H
        st1             {v0.8B}, [x3], x2
        uaddw           v29.8H, v29.8H, v5.8B
        sqxtun          v3.8B,  v27.8H
        st1             {v1.8B}, [x3], x2
        uaddw           v30.8H, v30.8H, v6.8B
        sqxtun          v4.8B,  v28.8H
        st1             {v2.8B}, [x3], x2
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v5.8B,  v29.8H
        st1             {v3.8B}, [x3], x2
        sqxtun          v6.8B,  v30.8H
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B}, [x3], x2
        st1             {v5.8B}, [x3], x2
        st1             {v6.8B}, [x3], x2
        st1             {v7.8B}, [x3], x2

        sub             x1,  x1,  #128                  // undo the eight post-increments
        ret
endfunc

/*
 * ff_h264_idct8_dc_add_neon
 *
 * 8x8 counterpart of ff_h264_idct_dc_add_neon: the first 16-bit value at
 * [x1] is replicated, rounding-shifted right by 6, added to eight 8-byte
 * rows at [x0] and stored back with unsigned saturation.
 *
 * In:   x0 = dest, eight rows of 8 bytes, x2 bytes apart
 *       x1 = block; only the first 16-bit value is read
 *       w2 = dest stride; sign-extended into x2 here
 * Out:  the 16-bit value at [x1] is set to 0; x1 is not modified
 *       x0 = entry x0 + 8 * stride
 * Uses: w3, v0-v7, v24-v31.
 */
function ff_h264_idct8_dc_add_neon, export=1
.L_ff_h264_idct8_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        mov             w3,  #0
        sxtw            x2,  w2
        ld1r            {v31.8H}, [x1]                  // replicate block[0] to all lanes
        strh            w3,  [x1]                       // block[0] = 0
        ld1             {v0.8B}, [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v1.8B}, [x0], x2
        ld1             {v2.8B}, [x0], x2
        uaddw           v24.8H, v31.8H, v0.8B
        ld1             {v3.8B}, [x0], x2
        uaddw           v25.8H, v31.8H, v1.8B
        ld1             {v4.8B}, [x0], x2
        uaddw           v26.8H, v31.8H, v2.8B
        ld1             {v5.8B}, [x0], x2
        uaddw           v27.8H, v31.8H, v3.8B
        ld1             {v6.8B}, [x0], x2
        uaddw           v28.8H, v31.8H, v4.8B
        ld1             {v7.8B}, [x0], x2
        uaddw           v29.8H, v31.8H, v5.8B
        uaddw           v30.8H, v31.8H, v6.8B
        uaddw           v31.8H, v31.8H, v7.8B           // last use of the replicated value
        sqxtun          v0.8B,  v24.8H
        sqxtun          v1.8B,  v25.8H
        sqxtun          v2.8B,  v26.8H
        sqxtun          v3.8B,  v27.8H
        sub             x0,  x0,  x2,  lsl #3           // rewind dest by eight rows
        st1             {v0.8B}, [x0], x2
        sqxtun          v4.8B,  v28.8H
        st1             {v1.8B}, [x0], x2
        sqxtun          v5.8B,  v29.8H
        st1             {v2.8B}, [x0], x2
        sqxtun          v6.8B,  v30.8H
        st1             {v3.8B}, [x0], x2
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B}, [x0], x2
        st1             {v5.8B}, [x0], x2
        st1             {v6.8B}, [x0], x2
        st1             {v7.8B}, [x0], x2
        ret
endfunc

/*
 * ff_h264_idct8_add4_neon
 *
 * Walks four 8x8 blocks (128 bytes of coefficients each). Per iteration the
 * scan8[] pointer advances by 4 entries and the block_offset pointer by 4
 * entries (16 bytes). With count = x4[scan8[4 * i]]:
 *   count == 0                   -> nothing is done for this block
 *   count == 1 and block[0] != 0 -> ff_h264_idct8_dc_add_neon
 *   anything else                -> ff_h264_idct8_add_neon
 *
 * In:   x0 = dest base, x1 = block_offset, x2 = block, w3 = stride,
 *       x4 = byte table indexed by scan8[] entries
 * Uses: x0-x2, x5-x7, x9-x15 plus whatever the callees use.
 *       The return address is parked in x12 because blr overwrites x30.
 *
 * The stride is moved into w2 once, before the loop: both callees only
 * sign-extend w2 into x2, which leaves w2 itself unchanged. Both callees
 * do overwrite x3/w3, and this loop keeps no state there.
 */
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30
        mov             x6,  x0                         // dest
        mov             x5,  x1                         // block_offset
        mov             x1,  x2                         // block
        mov             w2,  w3                         // stride, callee argument
        movrel          x7,  scan8
        mov             w10, #16                        // counts down in steps of 4
        movrel          x13, .L_ff_h264_idct8_dc_add_neon
        movrel          x14, .L_ff_h264_idct8_add_neon
1:      ldrb            w9,  [x7], #4                   // scan8[4 * i]
        ldrsw           x0,  [x5], #16                  // block_offset[4 * i]
        ldrb            w9,  [x4, w9, uxtw]             // count = x4[scan8[4 * i]]
        subs            w9,  w9,  #1                    // flags: lt if count == 0, eq if count == 1
        b.lt            2f
        ldrsh           w11, [x1]                       // block[0]; loads leave the flags alone
        add             x0,  x6,  x0                    // callee argument: dest + offset
        // count == 1: compare block[0] with 0. Otherwise force Z=1 (nzcv #4),
        // so "ne" is true only for count == 1 with a non-zero block[0].
        ccmp            w11, #0,  #4,  eq
        csel            x15, x13, x14, ne               // ne -> dc add, else full transform
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128                  // next block; add does not touch the flags
        b.ne            1b
        ret             x12
endfunc

/*
 * scan8: 48 one-byte indices, each written as column + row * 8. The
 * routines above use an entry as a byte index into the table passed in x4.
 */
const   scan8
        .byte            4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte            6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte            4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte            6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte            4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte            6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte            4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte            6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte            4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte            6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte            4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte            6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst