/*
 * IDCT AArch64 NEON optimisations
 *
 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Notes common to every routine in this file:
//   * Each routine handles one 8x8 block: 64 consecutive 16-bit coefficients
//     in, eight rows of eight bytes out, with rows x2 bytes apart.
//   * Row n of the block is kept in vector register vn throughout.
//   * Only v0-v7 and v16-v23 are written, so nothing needs saving.
//   * NOTE(review): coefficients are fetched with a byte-lane (.16b)
//     arrangement and then consumed as halfwords (.8h). That equals a
//     halfword load on little-endian targets only - confirm big-endian is
//     not a supported configuration.

// Store an 8x8 block of signed 16-bit coefficients as unsigned 8-bit pixels,
// saturating each value to the range 0..255.
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit results
//   x2 = row stride for results, bytes
// Clobbers: x0, x1, v0-v7
function ff_put_pixels_clamped_neon, export=1
        // Fetch all eight coefficient rows: rows 0-3, then rows 4-7.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]

        // sqxtun narrows signed 16-bit lanes to unsigned 8-bit with
        // saturation. The narrowing of rows 5-7 is interleaved with the
        // stores of rows 0-3.
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        st1             {v0.8b}, [x1], x2
        sqxtun          v5.8b, v5.8h
        st1             {v1.8b}, [x1], x2
        sqxtun          v6.8b, v6.8h
        st1             {v2.8b}, [x1], x2
        sqxtun          v7.8b, v7.8h
        st1             {v3.8b}, [x1], x2

        // Remaining rows; the last store leaves x1 unmodified.
        st1             {v4.8b}, [x1], x2
        st1             {v5.8b}, [x1], x2
        st1             {v6.8b}, [x1], x2
        st1             {v7.8b}, [x1]
        ret
endfunc

// Store an 8x8 block of signed 16-bit coefficients as 8-bit pixels after
// saturating each value to -128..127 and then adding a bias of 128.
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit results
//   x2 = row stride for results, bytes
// Clobbers: x0, x1, v0-v7, v16
function ff_put_signed_pixels_clamped_neon, export=1
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
        movi            v16.8b, #128                    // v16 = bias in every byte lane
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]

        // sqxtn narrows signed 16-bit lanes to signed 8-bit with saturation.
        sqxtn           v0.8b, v0.8h
        sqxtn           v1.8b, v1.8h
        sqxtn           v2.8b, v2.8h
        sqxtn           v3.8b, v3.8h

        // Bias rows 0-3 while rows 4-7 are still being narrowed. The byte
        // add wraps modulo 256, which maps -128..127 onto 0..255.
        sqxtn           v4.8b, v4.8h
        add             v0.8b, v0.8b, v16.8b
        sqxtn           v5.8b, v5.8h
        add             v1.8b, v1.8b, v16.8b
        sqxtn           v6.8b, v6.8h
        add             v2.8b, v2.8b, v16.8b
        sqxtn           v7.8b, v7.8h
        add             v3.8b, v3.8b, v16.8b

        // Write rows 0-3 while biasing rows 4-7.
        st1             {v0.8b}, [x1], x2
        add             v4.8b, v4.8b, v16.8b
        st1             {v1.8b}, [x1], x2
        add             v5.8b, v5.8b, v16.8b
        st1             {v2.8b}, [x1], x2
        add             v6.8b, v6.8b, v16.8b
        st1             {v3.8b}, [x1], x2
        add             v7.8b, v7.8b, v16.8b

        st1             {v4.8b}, [x1], x2
        st1             {v5.8b}, [x1], x2
        st1             {v6.8b}, [x1], x2
        st1             {v7.8b}, [x1]
        ret
endfunc

// Add an 8x8 block of signed 16-bit coefficients to the unsigned 8-bit
// pixels already in the destination, saturating each sum to 0..255 and
// writing it back in place.
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit input and results
//   x2 = row stride for 8-bit input and results, bytes
// Register roles:
//   x1      read cursor over the pixel rows
//   x3      write cursor over the same rows (starts as a copy of x1)
//   v0-v7   coefficient rows 0-7, later the finished output rows
//   v16-v23 existing pixel rows 0-7
// Clobbers: x0, x1, x3, v0-v7, v16-v23
function ff_add_pixels_clamped_neon, export=1
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
        mov             x3, x1                          // keep the block start for the stores
        ld1             {v16.8b}, [x1], x2
        ld1             {v17.8b}, [x1], x2
        ld1             {v18.8b}, [x1], x2
        ld1             {v19.8b}, [x1], x2
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]

        // uaddw zero-extends each pixel byte and adds it to the matching
        // 16-bit coefficient lane; sqxtun then saturates the sum to 0..255.
        // Pixel rows 4-7 are loaded in between, so every read of the
        // destination completes before the first store below.
        uaddw           v0.8h, v0.8h, v16.8b
        uaddw           v1.8h, v1.8h, v17.8b
        uaddw           v2.8h, v2.8h, v18.8b
        ld1             {v20.8b}, [x1], x2
        uaddw           v3.8h, v3.8h, v19.8b
        ld1             {v21.8b}, [x1], x2
        sqxtun          v0.8b, v0.8h
        ld1             {v22.8b}, [x1], x2
        sqxtun          v1.8b, v1.8h
        ld1             {v23.8b}, [x1]
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h

        // Write rows 0-3 while summing and narrowing rows 4-7.
        uaddw           v4.8h, v4.8h, v20.8b
        st1             {v0.8b}, [x3], x2
        uaddw           v5.8h, v5.8h, v21.8b
        st1             {v1.8b}, [x3], x2
        uaddw           v6.8h, v6.8h, v22.8b
        st1             {v2.8b}, [x3], x2
        uaddw           v7.8h, v7.8h, v23.8b
        sqxtun          v4.8b, v4.8h
        sqxtun          v5.8b, v5.8h
        st1             {v3.8b}, [x3], x2
        sqxtun          v6.8b, v6.8h
        sqxtun          v7.8b, v7.8h

        st1             {v4.8b}, [x3], x2
        st1             {v5.8b}, [x3], x2
        st1             {v6.8b}, [x3], x2
        st1             {v7.8b}, [x3]
        ret
endfunc