/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "asm-offsets.h"

// Generic template: emits ff_resample_one_<fmt>_neon, which computes one
// output sample as the dot product of a source window with one filter row.
//
// Register usage (as read off the code below; C prototype is an assumption —
// NOTE(review): confirm against the caller in swresample):
//   x0 = resample context (FILTER_BANK / FILTER_LENGTH / PHASE_SHIFT field
//        offsets come from asm-offsets.h; phase_mask sits right after
//        phase_shift, loaded by the ldp below)
//   x1 = dst base pointer
//   w2 = dst element index (sign-extended to x2, scaled by elem size on store)
//   x3 = src base pointer
//   x4 = position: sample_index = x4 >> phase_shift, filter row = x4 & phase_mask
//
// \es is log2(element size in bytes); defaults to 2 (4-byte elements).
//
// Accumulation uses two vector accumulators v0 (primary) and v1 (secondary).
// For the dbl and s32 formats both are live (M_MUL2/M_MLA2 feed v1, and
// STORE_ONE folds v1 into v0); for flt and s16 the M_*2 macros expand to
// nothing (empty varargs stubs defined just below) and STORE_ONE ignores v1.
.macro resample_one fmt, es=2
.ifnc \fmt, dbl
// Non-dbl formats: define no-op second-accumulator macros. (dbl supplies
// real forwarding M_MUL2/M_MLA2 before invoking this template; s32 instead
// handles the second accumulator inside M_MUL/M_MLA via the optional \d2.)
.macro M_MUL2 x:vararg
.endm
.macro M_MLA2 x:vararg
.endm
.endif
function ff_resample_one_\fmt\()_neon, export=1
        sxtw            x2, w2                      // sign-extend dst index
        ldr             x9, [x0, #FILTER_BANK]
        ldr             w6, [x0, #FILTER_LENGTH]
        ldp             w7, w8, [x0, #PHASE_SHIFT]  // and phase_mask
        lsr             x10, x4, x7                 // sample_index
        and             x4, x4, x8                  // filter row = index & phase_mask
        lsl             x11, x6, #\es               // filter_length * elem_size
        add             x3, x3, x10, lsl #\es       // src[sample_index]
        madd            x9, x11, x4, x9             // filter = bank + row * row_bytes
        cmp             w6, #16
        b.lt            5f                          // short filter: skip vector loop
8:      // remaining filter_length at least 16
        subs            w6, w6, #16
        LOAD8           v4, v5, v6, v7, x3          // src elements
        LOAD8           v16, v17, v18, v19, x9      // filter coefficients
        M_MUL           v0, v4, v16, v1             // start both accumulators
        M_MUL2          v1, v6, v18
7:      // software-pipelined core: 16 elements per pass, loads interleaved
        // with multiply-accumulates to hide load latency
        LOAD8           v20, v21, v22, v23, x3
        M_MLA           v0, v5, v17, v1
        M_MLA2          v1, v7, v19
        LOAD8           v24, v25, v26, v27, x9
        M_MLA           v0, v20, v24, v1
        M_MLA2          v1, v22, v26
        b.eq            6f                          // counter hit 0: drain and store
        cmp             w6, #16
        M_MLA           v0, v21, v25, v1
        M_MLA2          v1, v23, v27
        b.lt            4f                          // < 16 left: fall to scalar tails
        subs            w6, w6, #16
        LOAD8           v4, v5, v6, v7, x3          // prime the next iteration
        LOAD8           v16, v17, v18, v19, x9
        M_MLA           v0, v4, v16, v1
        M_MLA2          v1, v6, v18
        b               7b
6:      // drain the last pipelined pair, then store
        M_MLA           v0, v21, v25, v1
        M_MLA2          v1, v23, v27
        STORE_ONE       0, x1, x2, v1
        ret
5:      // filter_length < 16: zero both accumulators before the tails
        movi            v0.16b, #0
        movi            v1.16b, #0
4:      // remaining filter_length 1-15
        cmp             w6, #4
        b.lt            2f
        subs            w6, w6, #4
        LOAD4           v4, v5, x3
        LOAD4           v6, v7, x9
        M_MLA           v0, v4, v6, v1
        M_MLA2          v1, v5, v7
        b.eq            0f
        b               4b
2:      // remaining filter_length 1-3
        cmp             w6, #2
        b.lt            1f
        LOAD2           2, x3                       // note: LOAD1/LOAD2 take a
        LOAD2           3, x9                       // register *number*, not a name
        subs            w6, w6, #2
        M_MLA           v0, v2, v3
        b.eq            0f
1:      // remaining filter_length 1
        LOAD1           6, x3
        LOAD1           7, x9
        M_MLA           v0, v6, v7
0:
        STORE_ONE       0, x1, x2, v1
        ret
endfunc

// Drop the per-format helper macros so the next format can redefine them.
.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem LOAD8
.purgem M_MLA
.purgem M_MLA2
.purgem M_MUL
.purgem M_MUL2
.purgem STORE_ONE
.endm


// double precision float: 8-byte elements, 2 per q-register, accumulate in
// v0/v1 as .2d; STORE_ONE adds v1 into v0 and horizontally reduces.
.macro LOAD1 d1, addr
        ldr             d\d1, [\addr], #8
.endm
.macro LOAD2 d1, addr
        ld1             {v\d1\().2d}, [\addr], #16
.endm
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().2d,\d2\().2d}, [\addr], #32
.endm
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().2d,\d2\().2d,\d3\().2d,\d4\().2d}, [\addr], #64
.endm
.macro M_MLA d, r0, r1, d2:vararg
        fmla            \d\().2d, \r0\().2d, \r1\().2d
.endm
// For dbl, the second-accumulator macros really do work: they forward their
// arguments (v1, srcN, coefN) to the single-accumulator forms.
.macro M_MLA2 second:vararg
        M_MLA           \second
.endm
.macro M_MUL d, r0, r1, d2:vararg
        fmul            \d\().2d, \r0\().2d, \r1\().2d
.endm
.macro M_MUL2 second:vararg
        M_MUL           \second
.endm
.macro STORE_ONE rn, addr, idx, d2
        fadd            v\rn\().2d, v\rn\().2d, \d2\().2d   // merge accumulators
        faddp           d\rn\(), v\rn\().2d                 // horizontal sum
        str             d\rn\(), [\addr, \idx, lsl #3]
.endm

resample_one dbl, 3


// single precision float: 4-byte elements, 4 per q-register; only v0 is used
// (M_MLA2/M_MUL2 are the empty stubs), LOAD4/LOAD8 need 1/2 registers.
.macro LOAD1 d1, addr
        ldr             s\d1, [\addr], #4
.endm
.macro LOAD2 d1, addr
        ld1             {v\d1\().2s}, [\addr], #8
.endm
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().4s}, [\addr], #16
.endm
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().4s,\d2\().4s}, [\addr], #32
.endm
.macro M_MLA d, r0, r1, d2:vararg
        fmla            \d\().4s, \r0\().4s, \r1\().4s
.endm
.macro M_MUL d, r0, r1, d2:vararg
        fmul            \d\().4s, \r0\().4s, \r1\().4s
.endm
.macro STORE_ONE rn, addr, idx, d2
        faddp           v\rn\().4s, v\rn\().4s, v\rn\().4s  // \d2 unused here
        faddp           s\rn\(), v\rn\().2s
        str             s\rn\(), [\addr, \idx, lsl #2]
.endm

resample_one flt


// signed 16-bit: widening multiply s16*s16 -> s32 accumulator, rounded
// narrowing shift by 15 on store (Q15 fixed point).
.macro LOAD1 d1, addr
        ldr             h\d1, [\addr], #2
.endm
.macro LOAD2 d1, addr
        ldr             s\d1, [\addr], #4
.endm
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().4h}, [\addr], #8
.endm
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().4h,\d2\().4h}, [\addr], #16
.endm
.macro M_MLA d, r0, r1, d2:vararg
        smlal           \d\().4s, \r0\().4h, \r1\().4h
.endm
.macro M_MUL d, r0, r1, d2:vararg
        smull           \d\().4s, \r0\().4h, \r1\().4h
.endm
.macro STORE_ONE rn, addr, idx, d2
        addp            v\rn\().4s, v\rn\().4s, v\rn\().4s  // \d2 unused here
        addp            v\rn\().4s, v\rn\().4s, v\rn\().4s
        sqrshrn         v\rn\().4h, v\rn\().4s, #15         // saturating round/narrow
        str             h\rn\(), [\addr, \idx, lsl #1]
.endm

resample_one s16, 1


// signed 32-bit: widening multiply s32*s32 -> s64; here the second
// accumulator is driven from inside M_MLA/M_MUL (smlal2/smull2 on the high
// halves when \d2 is supplied), rounded narrowing shift by 30 on store.
.macro LOAD1 d1, addr
        ldr             s\d1, [\addr], #4
.endm
.macro LOAD2 d1, addr
        ld1             {v\d1\().2s}, [\addr], #8
.endm
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().4s}, [\addr], #16
.endm
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().4s,\d2\().4s}, [\addr], #32
.endm
.macro M_MLA d1, r0, r1, d2:vararg
        smlal           \d1\().2d, \r0\().2s, \r1\().2s
.ifnb \d2
        smlal2          \d2\().2d, \r0\().4s, \r1\().4s     // high halves -> v1
.endif
.endm
.macro M_MUL d1, r0, r1, d2:vararg
        smull           \d1\().2d, \r0\().2s, \r1\().2s
.ifnb \d2
        smull2          \d2\().2d, \r0\().4s, \r1\().4s
.endif
.endm
.macro STORE_ONE rn, addr, idx, d2
        add             v\rn\().2d, v\rn\().2d, \d2\().2d   // merge accumulators
        addp            d\rn\(), v\rn\().2d                 // horizontal sum
        sqrshrn         v\rn\().2s, v\rn\().2d, #30         // saturating round/narrow
        str             s\rn\(), [\addr, \idx, lsl #2]
.endm

resample_one s32