/*
 * VC1 AArch64 NEON optimisations
 *
 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// VC-1 8x8 inverse transform
// On entry:
// x0 -> array of 16-bit inverse transform coefficients, in column-major order
// On exit:
// array at x0 updated to hold transformed block; also now held in row-major order
function ff_vc1_inv_trans_8x8_neon, export=1
        ld1             {v1.16b, v2.16b}, [x0], #32
        ld1             {v3.16b, v4.16b}, [x0], #32
        ld1             {v5.16b, v6.16b}, [x0], #32
        shl             v1.8h, v1.8h, #2  // 8/2 * src[0]
        sub             x1, x0, #3*32
        ld1             {v16.16b, v17.16b}, [x0]
        shl             v7.8h, v2.8h, #4  // 16 * src[8]
        shl             v18.8h, v2.8h, #2  // 4 * src[8]
        shl             v19.8h, v4.8h, #4  // 16 * src[24]
        ldr             d0, .Lcoeffs_it8
        shl             v5.8h, v5.8h, #2  // 8/2 * src[32]
        shl             v20.8h, v6.8h, #4  // 16 * src[40]
        shl             v21.8h, v6.8h, #2  // 4 * src[40]
        shl             v22.8h, v17.8h, #4  // 16 * src[56]
        ssra            v20.8h, v19.8h, #2  // 4 * src[24] + 16 * src[40]
        mul             v23.8h, v3.8h, v0.h[0]  // 6/2 * src[16]
        sub             v19.8h, v19.8h, v21.8h  // 16 * src[24] - 4 * src[40]
        ssra            v7.8h, v22.8h, #2  // 16 * src[8] + 4 * src[56]
        sub             v18.8h, v22.8h, v18.8h  // - 4 * src[8] + 16 * src[56]
        shl             v3.8h, v3.8h, #3  // 16/2 * src[16]
        mls             v20.8h, v2.8h, v0.h[2]  // - 15 * src[8] + 4 * src[24] + 16 * src[40]
        ssra            v1.8h, v1.8h, #1  // 12/2 * src[0]
        ssra            v5.8h, v5.8h, #1  // 12/2 * src[32]
        mla             v7.8h, v4.8h, v0.h[2]  // 16 * src[8] + 15 * src[24] + 4 * src[56]
        shl             v21.8h, v16.8h, #3  // 16/2 * src[48]
        mls             v19.8h, v2.8h, v0.h[1]  // - 9 * src[8] + 16 * src[24] - 4 * src[40]
        sub             v2.8h, v23.8h, v21.8h  // t4/2 = 6/2 * src[16] - 16/2 * src[48]
        mla             v18.8h, v4.8h, v0.h[1]  // - 4 * src[8] + 9 * src[24] + 16 * src[56]
        add             v4.8h, v1.8h, v5.8h  // t1/2 = 12/2 * src[0] + 12/2 * src[32]
        sub             v1.8h, v1.8h, v5.8h  // t2/2 = 12/2 * src[0] - 12/2 * src[32]
        mla             v3.8h, v16.8h, v0.h[0]  // t3/2 = 16/2 * src[16] + 6/2 * src[48]
        mla             v7.8h, v6.8h, v0.h[1]  // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
        add             v5.8h, v1.8h, v2.8h  // t6/2 = t2/2 + t4/2
        sub             v16.8h, v1.8h, v2.8h  // t7/2 = t2/2 - t4/2
        mla             v20.8h, v17.8h, v0.h[1]  // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
        add             v21.8h, v1.8h, v2.8h  // t6/2 = t2/2 + t4/2
        add             v22.8h, v4.8h, v3.8h  // t5/2 = t1/2 + t3/2
        mls             v19.8h, v17.8h, v0.h[2]  // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
        sub             v17.8h, v4.8h, v3.8h  // t8/2 = t1/2 - t3/2
        add             v23.8h, v4.8h, v3.8h  // t5/2 = t1/2 + t3/2
        mls             v18.8h, v6.8h, v0.h[2]  // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v1.8h, v1.8h, v2.8h  // t7/2 = t2/2 - t4/2
        sub             v2.8h, v4.8h, v3.8h  // t8/2 = t1/2 - t3/2
        neg             v3.8h, v7.8h  // -t1
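        // First pass: fold the odd-part terms t1..t4 into the halved
        // even-part terms t5/2..t8/2 (ssra adds its operand shifted right
        // by 1), then round: dst = (t_even +/- t_odd + 4) >> 3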
        neg             v4.8h, v20.8h  // +t2
        neg             v6.8h, v19.8h  // +t3
        ssra            v22.8h, v7.8h, #1  // (t5 + t1) >> 1
        ssra            v1.8h, v19.8h, #1  // (t7 - t3) >> 1
        neg             v7.8h, v18.8h  // +t4
        ssra            v5.8h, v4.8h, #1  // (t6 + t2) >> 1
        ssra            v16.8h, v6.8h, #1  // (t7 + t3) >> 1
        ssra            v2.8h, v18.8h, #1  // (t8 - t4) >> 1
        ssra            v17.8h, v7.8h, #1  // (t8 + t4) >> 1
        ssra            v21.8h, v20.8h, #1  // (t6 - t2) >> 1
        ssra            v23.8h, v3.8h, #1  // (t5 - t1) >> 1
        srshr           v3.8h, v22.8h, #2  // (t5 + t1 + 4) >> 3
        srshr           v4.8h, v5.8h, #2  // (t6 + t2 + 4) >> 3
        srshr           v5.8h, v16.8h, #2  // (t7 + t3 + 4) >> 3
        srshr           v6.8h, v17.8h, #2  // (t8 + t4 + 4) >> 3
        srshr           v2.8h, v2.8h, #2  // (t8 - t4 + 4) >> 3
        srshr           v1.8h, v1.8h, #2  // (t7 - t3 + 4) >> 3
        srshr           v7.8h, v21.8h, #2  // (t6 - t2 + 4) >> 3
        srshr           v16.8h, v23.8h, #2  // (t5 - t1 + 4) >> 3
        trn2            v17.8h, v3.8h, v4.8h
        trn2            v18.8h, v5.8h, v6.8h
        trn2            v19.8h, v2.8h, v1.8h
        trn2            v20.8h, v7.8h, v16.8h
        trn1            v21.4s, v17.4s, v18.4s
        trn2            v17.4s, v17.4s, v18.4s
        trn1            v18.4s, v19.4s, v20.4s
        trn2            v19.4s, v19.4s, v20.4s
        trn1            v3.8h, v3.8h, v4.8h
        trn2            v4.2d, v21.2d, v18.2d
        trn1            v20.2d, v17.2d, v19.2d
        trn1            v5.8h, v5.8h, v6.8h
        trn1            v1.8h, v2.8h, v1.8h
        trn1            v2.8h, v7.8h, v16.8h
        trn1            v6.2d, v21.2d, v18.2d
        trn2            v7.2d, v17.2d, v19.2d
        shl             v16.8h, v20.8h, #4  // 16 * src[24]
        shl             v17.8h, v4.8h, #4  // 16 * src[40]
        trn1            v18.4s, v3.4s, v5.4s
        trn1            v19.4s, v1.4s, v2.4s
        shl             v21.8h, v7.8h, #4  // 16 * src[56]
        shl             v22.8h, v6.8h, #2  // 4 * src[8]
        shl             v23.8h, v4.8h, #2  // 4 * src[40]
        trn2            v3.4s, v3.4s, v5.4s
        trn2            v1.4s, v1.4s, v2.4s
        shl             v2.8h, v6.8h, #4  // 16 * src[8]
        sub             v5.8h, v16.8h, v23.8h  // 16 * src[24] - 4 * src[40]
        ssra            v17.8h, v16.8h, #2  // 4 * src[24] + 16 * src[40]
        sub             v16.8h, v21.8h, v22.8h  // - 4 * src[8] + 16 * src[56]
        trn1            v22.2d, v18.2d, v19.2d
        trn2            v18.2d, v18.2d, v19.2d
        trn1            v19.2d, v3.2d, v1.2d
        ssra            v2.8h, v21.8h, #2  // 16 * src[8] + 4 * src[56]
        mls             v17.8h, v6.8h, v0.h[2]  // - 15 * src[8] + 4 * src[24] + 16 * src[40]
        shl             v21.8h, v22.8h, #2  // 8/2 * src[0]
        shl             v18.8h, v18.8h, #2  // 8/2 * src[32]
        mls             v5.8h, v6.8h, v0.h[1]  // - 9 * src[8] + 16 * src[24] - 4 * src[40]
        shl             v6.8h, v19.8h, #3  // 16/2 * src[16]
        trn2            v1.2d, v3.2d, v1.2d
        mla             v16.8h, v20.8h, v0.h[1]  // - 4 * src[8] + 9 * src[24] + 16 * src[56]
        ssra            v21.8h, v21.8h, #1  // 12/2 * src[0]
        ssra            v18.8h, v18.8h, #1  // 12/2 * src[32]
        mul             v3.8h, v19.8h, v0.h[0]  // 6/2 * src[16]
        shl             v19.8h, v1.8h, #3  // 16/2 * src[48]
        mla             v2.8h, v20.8h, v0.h[2]  // 16 * src[8] + 15 * src[24] + 4 * src[56]
        add             v20.8h, v21.8h, v18.8h  // t1/2 = 12/2 * src[0] + 12/2 * src[32]
        mla             v6.8h, v1.8h, v0.h[0]  // t3/2 = 16/2 * src[16] + 6/2 * src[48]
        sub             v1.8h, v21.8h, v18.8h  // t2/2 = 12/2 * src[0] - 12/2 * src[32]
        sub             v3.8h, v3.8h, v19.8h  // t4/2 = 6/2 * src[16] - 16/2 * src[48]
        mla             v17.8h, v7.8h, v0.h[1]  // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
        mls             v5.8h, v7.8h, v0.h[2]  // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
        add             v7.8h, v1.8h, v3.8h  // t6/2 = t2/2 + t4/2
        add             v18.8h, v20.8h, v6.8h  // t5/2 = t1/2 + t3/2
        mls             v16.8h, v4.8h, v0.h[2]  // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v19.8h, v1.8h, v3.8h  // t7/2 = t2/2 - t4/2
        neg             v21.8h, v17.8h  // +t2
        mla             v2.8h, v4.8h, v0.h[1]  // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
        sub             v0.8h, v20.8h, v6.8h  // t8/2 = t1/2 - t3/2
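        // Second pass: srsra gives the difference terms an extra +1, so
        // after the final srshr #6 the sum outputs round as (x + 64) >> 7
        // and the difference outputs as (x + 65) >> 7, matching the
        // asymmetric rounding of the VC-1 8-point second pass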
        neg             v4.8h, v5.8h  // +t3
        sub             v22.8h, v1.8h, v3.8h  // t7/2 = t2/2 - t4/2
        sub             v23.8h, v20.8h, v6.8h  // t8/2 = t1/2 - t3/2
        neg             v24.8h, v16.8h  // +t4
        add             v6.8h, v20.8h, v6.8h  // t5/2 = t1/2 + t3/2
        add             v1.8h, v1.8h, v3.8h  // t6/2 = t2/2 + t4/2
        ssra            v7.8h, v21.8h, #1  // (t6 + t2) >> 1
        neg             v3.8h, v2.8h  // -t1
        ssra            v18.8h, v2.8h, #1  // (t5 + t1) >> 1
        ssra            v19.8h, v4.8h, #1  // (t7 + t3) >> 1
        ssra            v0.8h, v24.8h, #1  // (t8 + t4) >> 1
        srsra           v23.8h, v16.8h, #1  // (t8 - t4 + 1) >> 1
        srsra           v22.8h, v5.8h, #1  // (t7 - t3 + 1) >> 1
        srsra           v1.8h, v17.8h, #1  // (t6 - t2 + 1) >> 1
        srsra           v6.8h, v3.8h, #1  // (t5 - t1 + 1) >> 1
        srshr           v2.8h, v18.8h, #6  // (t5 + t1 + 64) >> 7
        srshr           v3.8h, v7.8h, #6  // (t6 + t2 + 64) >> 7
        srshr           v4.8h, v19.8h, #6  // (t7 + t3 + 64) >> 7
        srshr           v5.8h, v0.8h, #6  // (t8 + t4 + 64) >> 7
        srshr           v16.8h, v23.8h, #6  // (t8 - t4 + 65) >> 7
        srshr           v17.8h, v22.8h, #6  // (t7 - t3 + 65) >> 7
        st1             {v2.16b, v3.16b}, [x1], #32
        srshr           v0.8h, v1.8h, #6  // (t6 - t2 + 65) >> 7
        srshr           v1.8h, v6.8h, #6  // (t5 - t1 + 65) >> 7
        st1             {v4.16b, v5.16b}, [x1], #32
        st1             {v16.16b, v17.16b}, [x1], #32
        st1             {v0.16b, v1.16b}, [x1]
        ret
endfunc

// VC-1 8x4 inverse transform
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> array of 16-bit inverse transform coefficients, in row-major order
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_8x4_neon, export=1
        ld1             {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
        mov             x3, x0
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
        ldr             q0, .Lcoeffs_it8  // includes 4-point coefficients in upper half of vector
        ld1             {v5.8b}, [x0], x1
        trn2            v6.4h, v1.4h, v3.4h
        trn2            v7.4h, v2.4h, v4.4h
        trn1            v1.4h, v1.4h, v3.4h
        trn1            v2.4h, v2.4h, v4.4h
        trn2            v3.4h, v16.4h, v18.4h
        trn2            v4.4h, v17.4h, v19.4h
        trn1            v16.4h, v16.4h, v18.4h
        trn1            v17.4h, v17.4h, v19.4h
        ld1             {v18.8b}, [x0], x1
        trn1            v19.2s, v6.2s, v3.2s
        trn2            v3.2s, v6.2s, v3.2s
        trn1            v6.2s, v7.2s, v4.2s
        trn2            v4.2s, v7.2s, v4.2s
        trn1            v7.2s, v1.2s, v16.2s
        trn1            v20.2s, v2.2s, v17.2s
        shl             v21.4h, v19.4h, #4  // 16 * src[1]
        trn2            v1.2s, v1.2s, v16.2s
        shl             v16.4h, v3.4h, #4  // 16 * src[3]
        trn2            v2.2s, v2.2s, v17.2s
        shl             v17.4h, v6.4h, #4  // 16 * src[5]
        ld1             {v22.8b}, [x0], x1
        shl             v23.4h, v4.4h, #4  // 16 * src[7]
        mul             v24.4h, v1.4h, v0.h[0]  // 6/2 * src[2]
        ld1             {v25.8b}, [x0]
        shl             v26.4h, v19.4h, #2  // 4 * src[1]
        shl             v27.4h, v6.4h, #2  // 4 * src[5]
        ssra            v21.4h, v23.4h, #2  // 16 * src[1] + 4 * src[7]
        ssra            v17.4h, v16.4h, #2  // 4 * src[3] + 16 * src[5]
        sub             v23.4h, v23.4h, v26.4h  // - 4 * src[1] + 16 * src[7]
        sub             v16.4h, v16.4h, v27.4h  // 16 * src[3] - 4 * src[5]
        shl             v7.4h, v7.4h, #2  // 8/2 * src[0]
        shl             v20.4h, v20.4h, #2  // 8/2 * src[4]
        mla             v21.4h, v3.4h, v0.h[2]  // 16 * src[1] + 15 * src[3] + 4 * src[7]
        shl             v1.4h, v1.4h, #3  // 16/2 * src[2]
        mls             v17.4h, v19.4h, v0.h[2]  // - 15 * src[1] + 4 * src[3] + 16 * src[5]
        ssra            v7.4h, v7.4h, #1  // 12/2 * src[0]
        mls             v16.4h, v19.4h, v0.h[1]  // - 9 * src[1] + 16 * src[3] - 4 * src[5]
        ssra            v20.4h, v20.4h, #1  // 12/2 * src[4]
        mla             v23.4h, v3.4h, v0.h[1]  // - 4 * src[1] + 9 * src[3] + 16 * src[7]
        shl             v3.4h, v2.4h, #3  // 16/2 * src[6]
        mla             v1.4h, v2.4h, v0.h[0]  // t3/2 = 16/2 * src[2] + 6/2 * src[6]
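        // The mla/mls below complete t1..t4 for the 8-point row pass; the
        // butterfly and rounding then proceed exactly as in the first pass
        // of the 8x8 transform above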
        mla             v21.4h, v6.4h, v0.h[1]  // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7]
        mla             v17.4h, v4.4h, v0.h[1]  // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7]
        sub             v2.4h, v24.4h, v3.4h  // t4/2 = 6/2 * src[2] - 16/2 * src[6]
        mls             v16.4h, v4.4h, v0.h[2]  // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7]
        add             v3.4h, v7.4h, v20.4h  // t1/2 = 12/2 * src[0] + 12/2 * src[4]
        mls             v23.4h, v6.4h, v0.h[2]  // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7]
        sub             v4.4h, v7.4h, v20.4h  // t2/2 = 12/2 * src[0] - 12/2 * src[4]
        neg             v6.4h, v21.4h  // -t1
        add             v7.4h, v3.4h, v1.4h  // t5/2 = t1/2 + t3/2
        sub             v19.4h, v3.4h, v1.4h  // t8/2 = t1/2 - t3/2
        add             v20.4h, v4.4h, v2.4h  // t6/2 = t2/2 + t4/2
        sub             v24.4h, v4.4h, v2.4h  // t7/2 = t2/2 - t4/2
        add             v26.4h, v3.4h, v1.4h  // t5/2 = t1/2 + t3/2
        add             v27.4h, v4.4h, v2.4h  // t6/2 = t2/2 + t4/2
        sub             v2.4h, v4.4h, v2.4h  // t7/2 = t2/2 - t4/2
        sub             v1.4h, v3.4h, v1.4h  // t8/2 = t1/2 - t3/2
        neg             v3.4h, v17.4h  // +t2
        neg             v4.4h, v16.4h  // +t3
        neg             v28.4h, v23.4h  // +t4
        ssra            v7.4h, v21.4h, #1  // (t5 + t1) >> 1
        ssra            v1.4h, v23.4h, #1  // (t8 - t4) >> 1
        ssra            v20.4h, v3.4h, #1  // (t6 + t2) >> 1
        ssra            v24.4h, v4.4h, #1  // (t7 + t3) >> 1
        ssra            v19.4h, v28.4h, #1  // (t8 + t4) >> 1
        ssra            v2.4h, v16.4h, #1  // (t7 - t3) >> 1
        ssra            v27.4h, v17.4h, #1  // (t6 - t2) >> 1
        ssra            v26.4h, v6.4h, #1  // (t5 - t1) >> 1
        trn1            v1.2d, v7.2d, v1.2d
        trn1            v2.2d, v20.2d, v2.2d
        trn1            v3.2d, v24.2d, v27.2d
        trn1            v4.2d, v19.2d, v26.2d
        srshr           v1.8h, v1.8h, #2  // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
        srshr           v2.8h, v2.8h, #2  // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
        srshr           v3.8h, v3.8h, #2  // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
        srshr           v4.8h, v4.8h, #2  // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
        trn2            v6.8h, v1.8h, v2.8h
        trn1            v1.8h, v1.8h, v2.8h
        trn2            v2.8h, v3.8h, v4.8h
        trn1            v3.8h, v3.8h, v4.8h
        trn2            v4.4s, v6.4s, v2.4s
        trn1            v7.4s, v1.4s, v3.4s
        trn2            v1.4s, v1.4s, v3.4s
        mul             v3.8h, v4.8h, v0.h[5]  // 22/2 * src[24]
        trn1            v2.4s, v6.4s, v2.4s
        mul             v4.8h, v4.8h, v0.h[4]  // 10/2 * src[24]
        mul             v6.8h, v7.8h, v0.h[6]  // 17 * src[0]
        mul             v1.8h, v1.8h, v0.h[6]  // 17 * src[16]
        mls             v3.8h, v2.8h, v0.h[4]  // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
        mla             v4.8h, v2.8h, v0.h[5]  // t3/2 = 22/2 * src[8] + 10/2 * src[24]
        add             v0.8h, v6.8h, v1.8h  // t1 = 17 * src[0] + 17 * src[16]
        sub             v1.8h, v6.8h, v1.8h  // t2 = 17 * src[0] - 17 * src[16]
        neg             v2.8h, v3.8h  // -t4/2
        neg             v6.8h, v4.8h  // -t3/2
        ssra            v4.8h, v0.8h, #1  // (t1 + t3) >> 1
        ssra            v2.8h, v1.8h, #1  // (t2 - t4) >> 1
        ssra            v3.8h, v1.8h, #1  // (t2 + t4) >> 1
        ssra            v6.8h, v0.8h, #1  // (t1 - t3) >> 1
        srshr           v0.8h, v4.8h, #6  // (t1 + t3 + 64) >> 7
        srshr           v1.8h, v2.8h, #6  // (t2 - t4 + 64) >> 7
        srshr           v2.8h, v3.8h, #6  // (t2 + t4 + 64) >> 7
        srshr           v3.8h, v6.8h, #6  // (t1 - t3 + 64) >> 7
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v1.8h, v1.8h, v18.8b
        uaddw           v2.8h, v2.8h, v22.8b
        uaddw           v3.8h, v3.8h, v25.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3], x1
        st1             {v3.8b}, [x3]
        ret
endfunc

// VC-1 4x8 inverse transform
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_4x8_neon, export=1
        mov             x3, #16
        ldr             q0, .Lcoeffs_it8  // includes 4-point coefficients in upper half of vector
        mov             x4, x0
        ld1             {v1.d}[0], [x2], x3  // 00 01 02 03
        ld1             {v2.d}[0], [x2], x3  // 10 11 12 13
        ld1             {v3.d}[0], [x2], x3  // 20 21 22 23
        ld1             {v4.d}[0], [x2], x3  // 30 31 32 33
        ld1             {v1.d}[1], [x2], x3  // 40 41 42 43
        ld1             {v2.d}[1], [x2], x3  // 50 51 52 53
        ld1             {v3.d}[1], [x2], x3  // 60 61 62 63
        ld1             {v4.d}[1], [x2]  // 70 71 72 73
        ld1             {v5.s}[0], [x0], x1
        ld1             {v6.s}[0], [x0], x1
        ld1             {v7.s}[0], [x0], x1
        trn2            v16.8h, v1.8h, v2.8h  // 01 11 03 13 41 51 43 53
        trn1            v1.8h, v1.8h, v2.8h  // 00 10 02 12 40 50 42 52
        trn2            v2.8h, v3.8h, v4.8h  // 21 31 23 33 61 71 63 73
        trn1            v3.8h, v3.8h, v4.8h  // 20 30 22 32 60 70 62 72
        ld1             {v4.s}[0], [x0], x1
        trn2            v17.4s, v16.4s, v2.4s  // 03 13 23 33 43 53 63 73
        trn1            v18.4s, v1.4s, v3.4s  // 00 10 20 30 40 50 60 70
        trn1            v2.4s, v16.4s, v2.4s  // 01 11 21 31 41 51 61 71
        mul             v16.8h, v17.8h, v0.h[4]  // 10/2 * src[3]
        ld1             {v5.s}[1], [x0], x1
        mul             v17.8h, v17.8h, v0.h[5]  // 22/2 * src[3]
        ld1             {v6.s}[1], [x0], x1
        trn2            v1.4s, v1.4s, v3.4s  // 02 12 22 32 42 52 62 72
        mul             v3.8h, v18.8h, v0.h[6]  // 17 * src[0]
        ld1             {v7.s}[1], [x0], x1
        mul             v1.8h, v1.8h, v0.h[6]  // 17 * src[2]
        ld1             {v4.s}[1], [x0]
        mla             v16.8h, v2.8h, v0.h[5]  // t3/2 = 22/2 * src[1] + 10/2 * src[3]
        mls             v17.8h, v2.8h, v0.h[4]  // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
        add             v2.8h, v3.8h, v1.8h  // t1 = 17 * src[0] + 17 * src[2]
        sub             v1.8h, v3.8h, v1.8h  // t2 = 17 * src[0] - 17 * src[2]
        neg             v3.8h, v16.8h  // -t3/2
        ssra            v16.8h, v2.8h, #1  // (t1 + t3) >> 1
        neg             v18.8h, v17.8h  // -t4/2
        ssra            v17.8h, v1.8h, #1  // (t2 + t4) >> 1
        ssra            v3.8h, v2.8h, #1  // (t1 - t3) >> 1
        ssra            v18.8h, v1.8h, #1  // (t2 - t4) >> 1
        srshr           v1.8h, v16.8h, #2  // (t1 + t3 + 4) >> 3
        srshr           v2.8h, v17.8h, #2  // (t2 + t4 + 4) >> 3
        srshr           v3.8h, v3.8h, #2  // (t1 - t3 + 4) >> 3
        srshr           v16.8h, v18.8h, #2  // (t2 - t4 + 4) >> 3
        trn2            v17.8h, v2.8h, v3.8h  // 12 13 32 33 52 53 72 73
        trn2            v18.8h, v1.8h, v16.8h  // 10 11 30 31 50 51 70 71
        trn1            v1.8h, v1.8h, v16.8h  // 00 01 20 21 40 41 60 61
        trn1            v2.8h, v2.8h, v3.8h  // 02 03 22 23 42 43 62 63
        trn1            v3.4s, v18.4s, v17.4s  // 10 11 12 13 50 51 52 53
        trn2            v16.4s, v18.4s, v17.4s  // 30 31 32 33 70 71 72 73
        trn1            v17.4s, v1.4s, v2.4s  // 00 01 02 03 40 41 42 43
        mov             d18, v3.d[1]  // 50 51 52 53
        shl             v19.4h, v3.4h, #4  // 16 * src[8]
        mov             d20, v16.d[1]  // 70 71 72 73
        shl             v21.4h, v16.4h, #4  // 16 * src[24]
        mov             d22, v17.d[1]  // 40 41 42 43
        shl             v23.4h, v3.4h, #2  // 4 * src[8]
        shl             v24.4h, v18.4h, #4  // 16 * src[40]
        shl             v25.4h, v20.4h, #4  // 16 * src[56]
        shl             v26.4h, v18.4h, #2  // 4 * src[40]
        trn2            v1.4s, v1.4s, v2.4s  // 20 21 22 23 60 61 62 63
        ssra            v24.4h, v21.4h, #2  // 4 * src[24] + 16 * src[40]
        sub             v2.4h, v25.4h, v23.4h  // - 4 * src[8] + 16 * src[56]
        shl             v17.4h, v17.4h, #2  // 8/2 * src[0]
        sub             v21.4h, v21.4h, v26.4h  // 16 * src[24] - 4 * src[40]
        shl             v22.4h, v22.4h, #2  // 8/2 * src[32]
        mov             d23, v1.d[1]  // 60 61 62 63
        ssra            v19.4h, v25.4h, #2  // 16 * src[8] + 4 * src[56]
        mul             v25.4h, v1.4h, v0.h[0]  // 6/2 * src[16]
        shl             v1.4h, v1.4h, #3  // 16/2 * src[16]
        mls             v24.4h, v3.4h, v0.h[2]  // - 15 * src[8] + 4 * src[24] + 16 * src[40]
        ssra            v17.4h, v17.4h, #1  // 12/2 * src[0]
        mls             v21.4h, v3.4h, v0.h[1]  // - 9 * src[8] + 16 * src[24] - 4 * src[40]
        ssra            v22.4h, v22.4h, #1  // 12/2 * src[32]
        mla             v2.4h, v16.4h, v0.h[1]  // - 4 * src[8] + 9 * src[24] + 16 * src[56]
        shl             v3.4h, v23.4h, #3  // 16/2 * src[48]
        mla             v19.4h, v16.4h, v0.h[2]  // 16 * src[8] + 15 * src[24] + 4 * src[56]
        mla             v1.4h, v23.4h, v0.h[0]  // t3/2 = 16/2 * src[16] + 6/2 * src[48]
        mla             v24.4h, v20.4h, v0.h[1]  // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
        add             v16.4h, v17.4h, v22.4h  // t1/2 = 12/2 * src[0] + 12/2 * src[32]
        sub             v3.4h, v25.4h, v3.4h  // t4/2 = 6/2 * src[16] - 16/2 * src[48]
        sub             v17.4h, v17.4h, v22.4h  // t2/2 = 12/2 * src[0] - 12/2 * src[32]
        mls             v21.4h, v20.4h, v0.h[2]  // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
        mla             v19.4h, v18.4h, v0.h[1]  // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
        add             v20.4h, v16.4h, v1.4h  // t5/2 = t1/2 + t3/2
        mls             v2.4h, v18.4h, v0.h[2]  // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v0.4h, v16.4h, v1.4h  // t8/2 = t1/2 - t3/2
        add             v18.4h, v17.4h, v3.4h  // t6/2 = t2/2 + t4/2
        sub             v22.4h, v17.4h, v3.4h  // t7/2 = t2/2 - t4/2
        neg             v23.4h, v24.4h  // +t2
        sub             v25.4h, v17.4h, v3.4h  // t7/2 = t2/2 - t4/2
        add             v3.4h, v17.4h, v3.4h  // t6/2 = t2/2 + t4/2
        neg             v17.4h, v21.4h  // +t3
        sub             v26.4h, v16.4h, v1.4h  // t8/2 = t1/2 - t3/2
        add             v1.4h, v16.4h, v1.4h  // t5/2 = t1/2 + t3/2
        neg             v16.4h, v19.4h  // -t1
        neg             v27.4h, v2.4h  // +t4
        ssra            v20.4h, v19.4h, #1  // (t5 + t1) >> 1
        srsra           v0.4h, v2.4h, #1  // (t8 - t4 + 1) >> 1
        ssra            v18.4h, v23.4h, #1  // (t6 + t2) >> 1
        srsra           v22.4h, v21.4h, #1  // (t7 - t3 + 1) >> 1
        ssra            v25.4h, v17.4h, #1  // (t7 + t3) >> 1
        srsra           v3.4h, v24.4h, #1  // (t6 - t2 + 1) >> 1
        ssra            v26.4h, v27.4h, #1  // (t8 + t4) >> 1
        srsra           v1.4h, v16.4h, #1  // (t5 - t1 + 1) >> 1
        trn1            v0.2d, v20.2d, v0.2d
        trn1            v2.2d, v18.2d, v22.2d
        trn1            v3.2d, v25.2d, v3.2d
        trn1            v1.2d, v26.2d, v1.2d
        srshr           v0.8h, v0.8h, #6  // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
        srshr           v2.8h, v2.8h, #6  // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
        srshr           v3.8h, v3.8h, #6  // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
        srshr           v1.8h, v1.8h, #6  // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v2.8h, v2.8h, v6.8b
        uaddw           v3.8h, v3.8h, v7.8b
        uaddw           v1.8h, v1.8h, v4.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x4], x1
        st1             {v2.s}[0], [x4], x1
        st1             {v3.s}[0], [x4], x1
        st1             {v1.s}[0], [x4], x1
        st1             {v0.s}[1], [x4], x1
        st1             {v2.s}[1], [x4], x1
        st1             {v3.s}[1], [x4], x1
        st1             {v1.s}[1], [x4]
        ret
endfunc

// VC-1 4x4 inverse transform
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_4x4_neon, export=1
        mov             x3, #16
        ldr             d0, .Lcoeffs_it4
        mov             x4, x0
        ld1             {v1.d}[0], [x2], x3  // 00 01 02 03
        ld1             {v2.d}[0], [x2], x3  // 10 11 12 13
        ld1             {v3.d}[0], [x2], x3  // 20 21 22 23
        ld1             {v4.d}[0], [x2]  // 30 31 32 33
        ld1             {v5.s}[0], [x0], x1
        ld1             {v5.s}[1], [x0], x1
        ld1             {v6.s}[0], [x0], x1
        trn2            v7.4h, v1.4h, v2.4h  // 01 11 03 13
        trn1            v1.4h, v1.4h, v2.4h  // 00 10 02 12
        ld1             {v6.s}[1], [x0]
        trn2            v2.4h, v3.4h, v4.4h  // 21 31 23 33
        trn1            v3.4h, v3.4h, v4.4h  // 20 30 22 32
        trn2            v4.2s, v7.2s, v2.2s  // 03 13 23 33
        trn1            v16.2s, v1.2s, v3.2s  // 00 10 20 30
        trn1            v2.2s, v7.2s, v2.2s  // 01 11 21 31
        trn2            v1.2s, v1.2s, v3.2s  // 02 12 22 32
        mul             v3.4h, v4.4h, v0.h[0]  // 10/2 * src[3]
        mul             v4.4h, v4.4h, v0.h[1]  // 22/2 * src[3]
        mul             v7.4h, v16.4h, v0.h[2]  // 17 * src[0]
        mul             v1.4h, v1.4h, v0.h[2]  // 17 * src[2]
        mla             v3.4h, v2.4h, v0.h[1]  // t3/2 = 22/2 * src[1] + 10/2 * src[3]
        mls             v4.4h, v2.4h, v0.h[0]  // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
        add             v2.4h, v7.4h, v1.4h  // t1 = 17 * src[0] + 17 * src[2]
        sub             v1.4h, v7.4h, v1.4h  // t2 = 17 * src[0] - 17 * src[2]
        neg             v7.4h, v3.4h  // -t3/2
        neg             v16.4h, v4.4h  // -t4/2
        ssra            v3.4h, v2.4h, #1  // (t1 + t3) >> 1
        ssra            v4.4h, v1.4h, #1  // (t2 + t4) >> 1
        ssra            v16.4h, v1.4h, #1  // (t2 - t4) >> 1
        ssra            v7.4h, v2.4h, #1  // (t1 - t3) >> 1
        srshr           v1.4h, v3.4h, #2  // (t1 + t3 + 4) >> 3
        srshr           v2.4h, v4.4h, #2  // (t2 + t4 + 4) >> 3
        srshr           v3.4h, v16.4h, #2  // (t2 - t4 + 4) >> 3
        srshr           v4.4h, v7.4h, #2  // (t1 - t3 + 4) >> 3
        trn2            v7.4h, v1.4h, v3.4h  // 10 11 30 31
        trn1            v1.4h, v1.4h, v3.4h  // 00 01 20 21
        trn2            v3.4h, v2.4h, v4.4h  // 12 13 32 33
        trn1            v2.4h, v2.4h, v4.4h  // 02 03 22 23
        trn2            v4.2s, v7.2s, v3.2s  // 30 31 32 33
        trn1            v16.2s, v1.2s, v2.2s  // 00 01 02 03
        trn1            v3.2s, v7.2s, v3.2s  // 10 11 12 13
        trn2            v1.2s, v1.2s, v2.2s  // 20 21 22 23
        mul             v2.4h, v4.4h, v0.h[1]  // 22/2 * src[24]
        mul             v4.4h, v4.4h, v0.h[0]  // 10/2 * src[24]
        mul             v7.4h, v16.4h, v0.h[2]  // 17 * src[0]
        mul             v1.4h, v1.4h, v0.h[2]  // 17 * src[16]
        mls             v2.4h, v3.4h, v0.h[0]  // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
        mla             v4.4h, v3.4h, v0.h[1]  // t3/2 = 22/2 * src[8] + 10/2 * src[24]
        add             v0.4h, v7.4h, v1.4h  // t1 = 17 * src[0] + 17 * src[16]
        sub             v1.4h, v7.4h, v1.4h  // t2 = 17 * src[0] - 17 * src[16]
        neg             v3.4h, v2.4h  // -t4/2
        neg             v7.4h, v4.4h  // -t3/2
        ssra            v4.4h, v0.4h, #1  // (t1 + t3) >> 1
        ssra            v3.4h, v1.4h, #1  // (t2 - t4) >> 1
        ssra            v2.4h, v1.4h, #1  // (t2 + t4) >> 1
        ssra            v7.4h, v0.4h, #1  // (t1 - t3) >> 1
        trn1            v0.2d, v4.2d, v3.2d
        trn1            v1.2d, v2.2d, v7.2d
        srshr           v0.8h, v0.8h, #6  // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
        srshr           v1.8h, v1.8h, #6  // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v1.8h, v1.8h, v6.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x4], x1
        st1             {v0.s}[1], [x4], x1
        st1             {v1.s}[0], [x4], x1
        st1             {v1.s}[1], [x4]
        ret
endfunc

// VC-1 8x8 inverse transform, DC case
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> 16-bit inverse transform DC coefficient
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_8x8_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        add             w2, w2, w2, lsl #1
        ld1             {v3.8b}, [x0], x1
        ld1             {v4.8b}, [x0], x1
        add             w2, w2, #1
        ld1             {v5.8b}, [x0], x1
        asr             w2, w2, #1
        ld1             {v6.8b}, [x0], x1
        add             w2, w2, w2, lsl #1
        ld1             {v7.8b}, [x0]
        add             w0, w2, #16
        asr             w0, w0, #5
        dup             v16.8h, w0
        uaddw           v0.8h, v16.8h, v0.8b
        uaddw           v1.8h, v16.8h, v1.8b
        uaddw           v2.8h, v16.8h, v2.8b
        uaddw           v3.8h, v16.8h, v3.8b
        uaddw           v4.8h, v16.8h, v4.8b
        uaddw           v5.8h, v16.8h, v5.8b
        sqxtun          v0.8b, v0.8h
        uaddw           v6.8h, v16.8h, v6.8b
        sqxtun          v1.8b, v1.8h
        uaddw           v7.8h, v16.8h, v7.8b
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        st1             {v0.8b}, [x3], x1
        sqxtun          v0.8b, v5.8h
        st1             {v1.8b}, [x3], x1
        sqxtun          v1.8b, v6.8h
        st1             {v2.8b}, [x3], x1
        sqxtun          v2.8b, v7.8h
        st1             {v3.8b}, [x3], x1
        st1             {v4.8b}, [x3], x1
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3]
        ret
endfunc

// VC-1 8x4 inverse transform, DC case
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> 16-bit inverse transform DC coefficient
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_8x4_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        add             w2, w2, w2, lsl #1
        ld1             {v3.8b}, [x0]
        add             w0, w2, #1
        asr             w0, w0, #1
        add             w0, w0, w0, lsl #4
        add             w0, w0, #64
        asr             w0, w0, #7
        dup             v4.8h, w0
        uaddw           v0.8h, v4.8h, v0.8b
        uaddw           v1.8h, v4.8h, v1.8b
        uaddw           v2.8h, v4.8h, v2.8b
        uaddw           v3.8h, v4.8h, v3.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3], x1
        st1             {v3.8b}, [x3]
        ret
endfunc

// VC-1 4x8 inverse transform, DC case
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> 16-bit inverse transform DC coefficient
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_4x8_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.s}[0], [x0], x1
        ld1             {v1.s}[0], [x0], x1
        ld1             {v2.s}[0], [x0], x1
        add             w2, w2, w2, lsl #4
        ld1             {v3.s}[0], [x0], x1
        add             w2, w2, #4
        asr             w2, w2, #3
        add             w2, w2, w2, lsl #1
        ld1             {v0.s}[1], [x0], x1
        add             w2, w2, #16
        asr             w2, w2, #5
        dup             v4.8h, w2
        ld1             {v1.s}[1], [x0], x1
        ld1             {v2.s}[1], [x0], x1
        ld1             {v3.s}[1], [x0]
        uaddw           v0.8h, v4.8h, v0.8b
        uaddw           v1.8h, v4.8h, v1.8b
        uaddw           v2.8h, v4.8h, v2.8b
        uaddw           v3.8h, v4.8h, v3.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3], x1
        st1             {v2.s}[0], [x3], x1
        st1             {v3.s}[0], [x3], x1
        st1             {v0.s}[1], [x3], x1
        st1             {v1.s}[1], [x3], x1
        st1             {v2.s}[1], [x3], x1
        st1             {v3.s}[1], [x3]
        ret
endfunc

// VC-1 4x4 inverse transform, DC case
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> 16-bit inverse transform DC coefficient
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_4x4_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.s}[0], [x0], x1
        ld1             {v1.s}[0], [x0], x1
        ld1             {v0.s}[1], [x0], x1
        add             w2, w2, w2, lsl #4
        ld1             {v1.s}[1], [x0]
        add             w0, w2, #4
        asr             w0, w0, #3
        add             w0, w0, w0, lsl #4
        add             w0, w0, #64
        asr             w0, w0, #7
        dup             v2.8h, w0
        uaddw           v0.8h, v2.8h, v0.8b
        uaddw           v1.8h, v2.8h, v1.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3], x1
        st1             {v0.s}[1], [x3], x1
        st1             {v1.s}[1], [x3]
        ret
endfunc

        .align          5
.Lcoeffs_it8:
        .quad           0x000F00090003
.Lcoeffs_it4:
        .quad           0x0011000B0005
.Lcoeffs:
        .quad           0x00050002

// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
// x0 -> top-left pel of lower block
// x1 = row stride, bytes
// w2 = PQUANT bitstream parameter
function ff_vc1_v_loop_filter4_neon, export=1
        sub             x3, x0, w1, sxtw #2
        ldr             d0, .Lcoeffs
        ld1             {v1.s}[0], [x0], x1  // P5
        ld1             {v2.s}[0], [x3], x1  // P1
        ld1             {v3.s}[0], [x3], x1  // P2
        ld1             {v4.s}[0], [x0], x1  // P6
        ld1             {v5.s}[0], [x3], x1  // P3
        ld1             {v6.s}[0], [x0], x1  // P7
        ld1             {v7.s}[0], [x3]  // P4
        ld1             {v16.s}[0], [x0]  // P8
        ushll           v17.8h, v1.8b, #1  // 2*P5
        dup             v18.8h, w2  // pq
        ushll           v2.8h, v2.8b, #1  // 2*P1
        uxtl            v3.8h, v3.8b  // P2
        uxtl            v4.8h, v4.8b  // P6
        uxtl            v19.8h, v5.8b  // P3
        mls             v2.4h, v3.4h, v0.h[1]  // 2*P1-5*P2
        uxtl            v3.8h, v6.8b  // P7
        mls             v17.4h, v4.4h, v0.h[1]  // 2*P5-5*P6
        ushll           v5.8h, v5.8b, #1  // 2*P3
        uxtl            v6.8h, v7.8b  // P4
        mla             v17.4h, v3.4h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v3.8h, v16.8b  // P8
        mla             v2.4h, v19.4h, v0.h[1]  // 2*P1-5*P2+5*P3
        uxtl            v1.8h, v1.8b  // P5
        mls             v5.4h, v6.4h, v0.h[1]  // 2*P3-5*P4
        mls             v17.4h, v3.4h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
        sub             v3.4h, v6.4h, v1.4h  // P4-P5
        mls             v2.4h, v6.4h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
        mla             v5.4h, v1.4h, v0.h[1]  // 2*P3-5*P4+5*P5
        mls             v5.4h, v4.4h, v0.h[0]  // 2*P3-5*P4+5*P5-2*P6
        abs             v4.4h, v3.4h
        srshr           v7.4h, v17.4h, #3
        srshr           v2.4h, v2.4h, #3
        sshr            v4.4h, v4.4h, #1  // clip
        srshr           v5.4h, v5.4h, #3
        abs             v7.4h, v7.4h  // a2
        sshr            v3.4h, v3.4h, #8  // clip_sign
        abs             v2.4h, v2.4h  // a1
        cmeq            v16.4h, v4.4h, #0  // test clip == 0
        abs             v17.4h, v5.4h  // a0
        sshr            v5.4h, v5.4h, #8  // a0_sign
        cmhs            v19.4h, v2.4h, v7.4h  // test a1 >= a2
        cmhs            v18.4h, v17.4h, v18.4h  // test a0 >= pq
        sub             v3.4h, v3.4h, v5.4h  // clip_sign - a0_sign
        bsl             v19.8b, v7.8b, v2.8b  // a3
        orr             v2.8b, v16.8b, v18.8b  // test clip == 0 || a0 >= pq
        uqsub           v5.4h, v17.4h, v19.4h  // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite and then take the abs)
        cmhs            v7.4h, v19.4h, v17.4h  // test a3 >= a0
        mul             v0.4h, v5.4h, v0.h[1]  // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.8b, v2.8b, v7.8b  // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w0, v5.s[1]  // move to gp reg
        ushr            v0.4h, v0.4h, #3  // a0 >= a3 ? (5*(a0-a3))>>3 : 0
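        // d = FFMIN((5*(a0-a3)) >> 3, clip) is formed below and applied to
        // P4/P5 with the sign of (P4-P5); the tbnz skips the writeback
        // entirely when the decisive pixel pair is not filtered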
        cmhs            v5.4h, v0.4h, v4.4h
        tbnz            w0, #0, 1f  // none of the 4 pixel pairs should be updated if this one is not filtered
        bsl             v5.8b, v4.8b, v0.8b  // FFMIN(d, clip)
        bic             v0.8b, v5.8b, v2.8b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mls             v6.4h, v0.4h, v3.4h  // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        mla             v1.4h, v0.4h, v3.4h  // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v6.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
// x0 -> top-left pel of right block
// x1 = row stride, bytes
// w2 = PQUANT bitstream parameter
function ff_vc1_h_loop_filter4_neon, export=1
        sub             x3, x0, #4  // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1
        sub             x0, x0, #1  // where to start writing
        ld1             {v2.8b}, [x3], x1
        ld1             {v3.8b}, [x3], x1
        ld1             {v4.8b}, [x3]
        dup             v5.8h, w2  // pq
        trn1            v6.8b, v1.8b, v2.8b
        trn2            v1.8b, v1.8b, v2.8b
        trn1            v2.8b, v3.8b, v4.8b
        trn2            v3.8b, v3.8b, v4.8b
        trn1            v4.4h, v6.4h, v2.4h  // P1, P5
        trn1            v7.4h, v1.4h, v3.4h  // P2, P6
        trn2            v2.4h, v6.4h, v2.4h  // P3, P7
        trn2            v1.4h, v1.4h, v3.4h  // P4, P8
        ushll           v3.8h, v4.8b, #1  // 2*P1, 2*P5
        uxtl            v6.8h, v7.8b  // P2, P6
        uxtl            v7.8h, v2.8b  // P3, P7
        uxtl            v1.8h, v1.8b  // P4, P8
        mls             v3.8h, v6.8h, v0.h[1]  // 2*P1-5*P2, 2*P5-5*P6
        ushll           v2.8h, v2.8b, #1  // 2*P3, 2*P7
        uxtl            v4.8h, v4.8b  // P1, P5
        mla             v3.8h, v7.8h, v0.h[1]  // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
        mov             d6, v6.d[1]  // P6
        mls             v3.8h, v1.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
        mov             d4, v4.d[1]  // P5
        mls             v2.4h, v1.4h, v0.h[1]  // 2*P3-5*P4
        mla             v2.4h, v4.4h, v0.h[1]  // 2*P3-5*P4+5*P5
        sub             v7.4h, v1.4h, v4.4h  // P4-P5
        mls             v2.4h, v6.4h, v0.h[0]  // 2*P3-5*P4+5*P5-2*P6
        srshr           v3.8h, v3.8h, #3
        abs             v6.4h, v7.4h
        sshr            v7.4h, v7.4h, #8  // clip_sign
        srshr           v2.4h, v2.4h, #3
        abs             v3.8h, v3.8h  // a1, a2
        sshr            v6.4h, v6.4h, #1  // clip
        mov             d16, v3.d[1]  // a2
        abs             v17.4h, v2.4h  // a0
        cmeq            v18.4h, v6.4h, #0  // test clip == 0
        sshr            v2.4h, v2.4h, #8  // a0_sign
        cmhs            v19.4h, v3.4h, v16.4h  // test a1 >= a2
        cmhs            v5.4h, v17.4h, v5.4h  // test a0 >= pq
        sub             v2.4h, v7.4h, v2.4h  // clip_sign - a0_sign
        bsl             v19.8b, v16.8b, v3.8b  // a3
        orr             v3.8b, v18.8b, v5.8b  // test clip == 0 || a0 >= pq
        uqsub           v5.4h, v17.4h, v19.4h  // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite and then take the abs)
        cmhs            v7.4h, v19.4h, v17.4h  // test a3 >= a0
        mul             v0.4h, v5.4h, v0.h[1]  // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.8b, v3.8b, v7.8b  // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w2, v5.s[1]  // move to gp reg
        ushr            v0.4h, v0.4h, #3  // a0 >= a3 ? (5*(a0-a3))>>3 : 0
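        // Same decision logic as the vertical filter, but the updated P4/P5
        // pairs are written back across the vertical edge as interleaved
        // byte pairs using st2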
        cmhs            v5.4h, v0.4h, v6.4h
        tbnz            w2, #0, 1f  // none of the 4 pixel pairs should be updated if this one is not filtered
        bsl             v5.8b, v6.8b, v0.8b  // FFMIN(d, clip)
        bic             v0.8b, v5.8b, v3.8b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v4.4h, v0.4h, v2.4h  // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        mls             v1.4h, v0.4h, v2.4h  // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        sqxtun          v3.8b, v4.8h
        sqxtun          v2.8b, v1.8h
        st2             {v2.b, v3.b}[0], [x0], x1
        st2             {v2.b, v3.b}[1], [x0], x1
        st2             {v2.b, v3.b}[2], [x0], x1
        st2             {v2.b, v3.b}[3], [x0]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
// x0 -> top-left pel of lower block
// x1 = row stride, bytes
// w2 = PQUANT bitstream parameter
function ff_vc1_v_loop_filter8_neon, export=1
        sub             x3, x0, w1, sxtw #2
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x0], x1  // P5
        movi            v2.2d, #0x0000ffff00000000
        ld1             {v3.8b}, [x3], x1  // P1
        ld1             {v4.8b}, [x3], x1  // P2
        ld1             {v5.8b}, [x0], x1  // P6
        ld1             {v6.8b}, [x3], x1  // P3
        ld1             {v7.8b}, [x0], x1  // P7
        ushll           v16.8h, v1.8b, #1  // 2*P5
        ushll           v3.8h, v3.8b, #1  // 2*P1
        ld1             {v17.8b}, [x3]  // P4
        uxtl            v4.8h, v4.8b  // P2
        ld1             {v18.8b}, [x0]  // P8
        uxtl            v5.8h, v5.8b  // P6
        dup             v19.8h, w2  // pq
        uxtl            v20.8h, v6.8b  // P3
        mls             v3.8h, v4.8h, v0.h[1]  // 2*P1-5*P2
        uxtl            v4.8h, v7.8b  // P7
        ushll           v6.8h, v6.8b, #1  // 2*P3
        mls             v16.8h, v5.8h, v0.h[1]  // 2*P5-5*P6
        uxtl            v7.8h, v17.8b  // P4
        uxtl            v17.8h, v18.8b  // P8
        mla             v16.8h, v4.8h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v1.8h, v1.8b  // P5
        mla             v3.8h, v20.8h, v0.h[1]  // 2*P1-5*P2+5*P3
        sub             v4.8h, v7.8h, v1.8h  // P4-P5
        mls             v6.8h, v7.8h, v0.h[1]  // 2*P3-5*P4
        mls             v16.8h, v17.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
        abs             v17.8h, v4.8h
        sshr            v4.8h, v4.8h, #8  // clip_sign
        mls             v3.8h, v7.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
        sshr            v17.8h, v17.8h, #1  // clip
        mla             v6.8h, v1.8h, v0.h[1]  // 2*P3-5*P4+5*P5
        srshr           v16.8h, v16.8h, #3
        mls             v6.8h, v5.8h, v0.h[0]  // 2*P3-5*P4+5*P5-2*P6
        cmeq            v5.8h, v17.8h, #0  // test clip == 0
        srshr           v3.8h, v3.8h, #3
        abs             v16.8h, v16.8h  // a2
        abs             v3.8h, v3.8h  // a1
        srshr           v6.8h, v6.8h, #3
        cmhs            v18.8h, v3.8h, v16.8h  // test a1 >= a2
        abs             v20.8h, v6.8h  // a0
        sshr            v6.8h, v6.8h, #8  // a0_sign
        bsl             v18.16b, v16.16b, v3.16b  // a3
        cmhs            v3.8h, v20.8h, v19.8h  // test a0 >= pq
        sub             v4.8h, v4.8h, v6.8h  // clip_sign - a0_sign
        uqsub           v6.8h, v20.8h, v18.8h  // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite and then take the abs)
        cmhs            v16.8h, v18.8h, v20.8h  // test a3 >= a0
        orr             v3.16b, v5.16b, v3.16b  // test clip == 0 || a0 >= pq
        mul             v0.8h, v6.8h, v0.h[1]  // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.16b, v3.16b, v16.16b  // test clip == 0 || a0 >= pq || a3 >= a0
        cmtst           v2.2d, v5.2d, v2.2d  // if 2nd of each group of 4 is not filtered, then none of the others in the group should be either
        mov             w0, v5.s[1]  // move to gp reg
        ushr            v0.8h, v0.8h, #3  // a0 >= a3 ? (5*(a0-a3))>>3 : 0
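        // Per the VC-1 spec, the filtering decision for each group of 4
        // pixel pairs is taken from the third pair in the group (halfword
        // lane 2, selected by the 0x0000ffff00000000 mask above) and
        // propagated to the whole group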
        mov             w2, v5.s[3]
        orr             v2.16b, v3.16b, v2.16b
        cmhs            v3.8h, v0.8h, v17.8h
        and             w0, w0, w2
        bsl             v3.16b, v17.16b, v0.16b  // FFMIN(d, clip)
        tbnz            w0, #0, 1f  // none of the 8 pixel pairs should be updated in this case
        bic             v0.16b, v3.16b, v2.16b  // set each d to zero if it should not be filtered
        mls             v7.8h, v0.8h, v4.8h  // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        mla             v1.8h, v0.8h, v4.8h  // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v7.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
// x0 -> top-left pel of right block
// x1 = row stride, bytes
// w2 = PQUANT bitstream parameter
function ff_vc1_h_loop_filter8_neon, export=1
        sub             x3, x0, #4  // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1  // P1[0], P2[0]...
        sub             x0, x0, #1  // where to start writing
        ld1             {v2.8b}, [x3], x1
        add             x4, x0, x1, lsl #2
        ld1             {v3.8b}, [x3], x1
        ld1             {v4.8b}, [x3], x1
        ld1             {v5.8b}, [x3], x1
        ld1             {v6.8b}, [x3], x1
        ld1             {v7.8b}, [x3], x1
        trn1            v16.8b, v1.8b, v2.8b  // P1[0], P1[1], P3[0]...
        ld1             {v17.8b}, [x3]
        trn2            v1.8b, v1.8b, v2.8b  // P2[0], P2[1], P4[0]...
        trn1            v2.8b, v3.8b, v4.8b  // P1[2], P1[3], P3[2]...
        trn2            v3.8b, v3.8b, v4.8b  // P2[2], P2[3], P4[2]...
        dup             v4.8h, w2  // pq
        trn1            v18.8b, v5.8b, v6.8b  // P1[4], P1[5], P3[4]...
        trn2            v5.8b, v5.8b, v6.8b  // P2[4], P2[5], P4[4]...
        trn1            v6.4h, v16.4h, v2.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
        trn1            v19.4h, v1.4h, v3.4h  // P2[0], P2[1], P2[2], P2[3], P6[0]...
        trn1            v20.8b, v7.8b, v17.8b  // P1[6], P1[7], P3[6]...
        trn2            v7.8b, v7.8b, v17.8b  // P2[6], P2[7], P4[6]...
        trn2            v2.4h, v16.4h, v2.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
        trn2            v1.4h, v1.4h, v3.4h  // P4[0], P4[1], P4[2], P4[3], P8[0]...
        trn1            v3.4h, v18.4h, v20.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
        trn1            v16.4h, v5.4h, v7.4h  // P2[4], P2[5], P2[6], P2[7], P6[4]...
        trn2            v17.4h, v18.4h, v20.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
        trn2            v5.4h, v5.4h, v7.4h  // P4[4], P4[5], P4[6], P4[7], P8[4]...
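        // Final transpose stage: after the trn1/trn2 below, each register
        // holds one complete row of P1..P8 samples across the 8 pixel pairs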
        trn1            v7.2s, v6.2s, v3.2s  // P1
        trn1            v18.2s, v19.2s, v16.2s  // P2
        trn2            v3.2s, v6.2s, v3.2s  // P5
        trn2            v6.2s, v19.2s, v16.2s  // P6
        trn1            v16.2s, v2.2s, v17.2s  // P3
        trn2            v2.2s, v2.2s, v17.2s  // P7
        ushll           v7.8h, v7.8b, #1  // 2*P1
        trn1            v17.2s, v1.2s, v5.2s  // P4
        ushll           v19.8h, v3.8b, #1  // 2*P5
        trn2            v1.2s, v1.2s, v5.2s  // P8
        uxtl            v5.8h, v18.8b  // P2
        uxtl            v6.8h, v6.8b  // P6
        uxtl            v18.8h, v16.8b  // P3
        mls             v7.8h, v5.8h, v0.h[1]  // 2*P1-5*P2
        uxtl            v2.8h, v2.8b  // P7
        ushll           v5.8h, v16.8b, #1  // 2*P3
        mls             v19.8h, v6.8h, v0.h[1]  // 2*P5-5*P6
        uxtl            v16.8h, v17.8b  // P4
        uxtl            v1.8h, v1.8b  // P8
        mla             v19.8h, v2.8h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v2.8h, v3.8b  // P5
        mla             v7.8h, v18.8h, v0.h[1]  // 2*P1-5*P2+5*P3
        sub             v3.8h, v16.8h, v2.8h  // P4-P5
        mls             v5.8h, v16.8h, v0.h[1]  // 2*P3-5*P4
        mls             v19.8h, v1.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
        abs             v1.8h, v3.8h
        sshr            v3.8h, v3.8h, #8  // clip_sign
        mls             v7.8h, v16.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
        sshr            v1.8h, v1.8h, #1  // clip
        mla             v5.8h, v2.8h, v0.h[1]  // 2*P3-5*P4+5*P5
        srshr           v17.8h, v19.8h, #3
        mls             v5.8h, v6.8h, v0.h[0]  // 2*P3-5*P4+5*P5-2*P6
        cmeq            v6.8h, v1.8h, #0  // test clip == 0
        srshr           v7.8h, v7.8h, #3
        abs             v17.8h, v17.8h  // a2
        abs             v7.8h, v7.8h  // a1
        srshr           v5.8h, v5.8h, #3
        cmhs            v18.8h, v7.8h, v17.8h  // test a1 >= a2
        abs             v19.8h, v5.8h  // a0
        sshr            v5.8h, v5.8h, #8  // a0_sign
        bsl             v18.16b, v17.16b, v7.16b  // a3
        cmhs            v4.8h, v19.8h, v4.8h  // test a0 >= pq
        sub             v3.8h, v3.8h, v5.8h  // clip_sign - a0_sign
        uqsub           v5.8h, v19.8h, v18.8h  // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite and then take the abs)
        cmhs            v7.8h, v18.8h, v19.8h  // test a3 >= a0
        orr             v4.16b, v6.16b, v4.16b  // test clip == 0 || a0 >= pq
        mul             v0.8h, v5.8h, v0.h[1]  // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.16b, v4.16b, v7.16b  // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w2, v5.s[1]  // move to gp reg
        ushr            v0.8h, v0.8h, #3  // a0 >= a3 ? (5*(a0-a3))>>3 : 0
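        // The two groups of 4 rows are gated independently: w2 and w3 carry
        // the per-group decisions, combined to skip all stores and tested
        // individually before each block of st2 stores below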
        mov             w3, v5.s[3]
        cmhs            v5.8h, v0.8h, v1.8h
        and             w5, w2, w3
        bsl             v5.16b, v1.16b, v0.16b  // FFMIN(d, clip)
        tbnz            w5, #0, 2f  // none of the 8 pixel pairs should be updated in this case
        bic             v0.16b, v5.16b, v4.16b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v2.8h, v0.8h, v3.8h  // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        mls             v16.8h, v0.8h, v3.8h  // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        sqxtun          v1.8b, v2.8h
        sqxtun          v0.8b, v16.8h
        tbnz            w2, #0, 1f  // none of the first 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[0], [x0], x1
        st2             {v0.b, v1.b}[1], [x0], x1
        st2             {v0.b, v1.b}[2], [x0], x1
        st2             {v0.b, v1.b}[3], [x0]
1:      tbnz            w3, #0, 2f  // none of the second 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[4], [x4], x1
        st2             {v0.b, v1.b}[5], [x4], x1
        st2             {v0.b, v1.b}[6], [x4], x1
        st2             {v0.b, v1.b}[7], [x4]
2:      ret
endfunc

// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
// x0 -> top-left pel of lower block
// x1 = row stride, bytes
// w2 = PQUANT bitstream parameter
function ff_vc1_v_loop_filter16_neon, export=1
        sub             x3, x0, w1, sxtw #2
        ldr             d0, .Lcoeffs
        ld1             {v1.16b}, [x0], x1  // P5
        movi            v2.2d, #0x0000ffff00000000
        ld1             {v3.16b}, [x3], x1  // P1
        ld1             {v4.16b}, [x3], x1  // P2
        ld1             {v5.16b}, [x0], x1  // P6
        ld1             {v6.16b}, [x3], x1  // P3
        ld1             {v7.16b}, [x0], x1  // P7
        ushll           v16.8h, v1.8b, #1  // 2*P5[0..7]
        ushll           v17.8h, v3.8b, #1  // 2*P1[0..7]
        ld1             {v18.16b}, [x3]  // P4
        uxtl            v19.8h, v4.8b  // P2[0..7]
        ld1             {v20.16b}, [x0]  // P8
        uxtl            v21.8h, v5.8b  // P6[0..7]
        dup             v22.8h, w2  // pq
        ushll2          v3.8h, v3.16b, #1  // 2*P1[8..15]
        mls             v17.8h, v19.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
        ushll2          v19.8h, v1.16b, #1  // 2*P5[8..15]
        uxtl2           v4.8h, v4.16b  // P2[8..15]
        mls             v16.8h, v21.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
        uxtl2           v5.8h, v5.16b  // P6[8..15]
        uxtl            v23.8h, v6.8b  // P3[0..7]
        uxtl            v24.8h, v7.8b  // P7[0..7]
        mls             v3.8h, v4.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]
        ushll           v4.8h, v6.8b, #1  // 2*P3[0..7]
        uxtl            v25.8h, v18.8b  // P4[0..7]
        mls             v19.8h, v5.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
        uxtl2           v26.8h, v6.16b  // P3[8..15]
        mla             v17.8h, v23.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        uxtl2           v7.8h, v7.16b  // P7[8..15]
        ushll2          v6.8h, v6.16b, #1  // 2*P3[8..15]
        mla             v16.8h, v24.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        uxtl2           v18.8h, v18.16b  // P4[8..15]
        uxtl            v23.8h, v20.8b  // P8[0..7]
        mls             v4.8h, v25.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
        uxtl            v24.8h, v1.8b  // P5[0..7]
        uxtl2           v20.8h, v20.16b  // P8[8..15]
        mla             v3.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        uxtl2           v1.8h, v1.16b  // P5[8..15]
        sub             v26.8h, v25.8h, v24.8h  // P4[0..7]-P5[0..7]
        mla             v19.8h, v7.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        sub             v7.8h, v18.8h, v1.8h  // P4[8..15]-P5[8..15]
        mls             v6.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
        abs             v27.8h, v26.8h
        sshr            v26.8h, v26.8h, #8  // clip_sign[0..7]
        mls             v17.8h, v25.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        abs             v28.8h, v7.8h
        sshr            v27.8h, v27.8h, #1  // clip[0..7]
        mls             v16.8h, v23.8h, v0.h[0]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        sshr            v7.8h, v7.8h, #8  // clip_sign[8..15]
        sshr            v23.8h, v28.8h, #1  // clip[8..15]
        mla             v4.8h, v24.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        cmeq            v28.8h, v27.8h, #0  // test clip[0..7] == 0
        srshr           v17.8h, v17.8h, #3
        mls             v3.8h, v18.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        cmeq            v29.8h, v23.8h, #0  // test clip[8..15] == 0
        srshr           v16.8h, v16.8h, #3
        mls             v19.8h, v20.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        abs             v17.8h, v17.8h  // a1[0..7]
        mla             v6.8h, v1.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        srshr           v3.8h, v3.8h, #3
        mls             v4.8h, v21.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        abs             v16.8h, v16.8h  // a2[0..7]
        srshr           v19.8h, v19.8h, #3
        mls             v6.8h, v5.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        cmhs            v5.8h, v17.8h, v16.8h  // test a1[0..7] >= a2[0..7]
        abs             v3.8h, v3.8h  // a1[8..15]
        srshr           v4.8h, v4.8h, #3
        abs             v19.8h, v19.8h  // a2[8..15]
        bsl             v5.16b, v16.16b, v17.16b  // a3[0..7]
        srshr           v6.8h, v6.8h, #3
        cmhs            v16.8h, v3.8h, v19.8h  // test a1[8..15] >= a2[8..15]
        abs             v17.8h, v4.8h  // a0[0..7]
        sshr            v4.8h, v4.8h, #8  // a0_sign[0..7]
        bsl             v16.16b, v19.16b, v3.16b  // a3[8..15]
        uqsub           v3.8h, v17.8h, v5.8h  // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite and then take the abs)
        abs             v19.8h, v6.8h  // a0[8..15]
        cmhs            v20.8h, v17.8h, v22.8h  // test a0[0..7] >= pq
        cmhs            v5.8h, v5.8h, v17.8h  // test a3[0..7] >= a0[0..7]
        sub             v4.8h, v26.8h, v4.8h  // clip_sign[0..7] - a0_sign[0..7]
        sshr            v6.8h, v6.8h, #8  // a0_sign[8..15]
        mul             v3.8h, v3.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        uqsub           v17.8h, v19.8h, v16.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite and then take the abs)
        orr             v20.16b, v28.16b, v20.16b  // test clip[0..7] == 0 || a0[0..7] >= pq
        cmhs            v21.8h, v19.8h, v22.8h  // test a0[8..15] >= pq
        cmhs            v16.8h, v16.8h, v19.8h  // test a3[8..15] >= a0[8..15]
        mul             v0.8h, v17.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        sub             v6.8h, v7.8h, v6.8h  // clip_sign[8..15] - a0_sign[8..15]
        orr             v5.16b, v20.16b, v5.16b  // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        ushr            v3.8h, v3.8h, #3  // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        orr             v7.16b, v29.16b, v21.16b  // test clip[8..15] == 0 || a0[8..15] >= pq
        cmtst           v17.2d, v5.2d, v2.2d  // if 2nd of each group of 4 is not filtered, then none of the others in the group should be either
        mov             w0, v5.s[1]  // move to gp reg
        cmhs            v19.8h, v3.8h, v27.8h
        ushr            v0.8h, v0.8h, #3  // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        mov             w2, v5.s[3]
        orr             v5.16b, v7.16b, v16.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        orr             v16.16b, v20.16b, v17.16b
        bsl             v19.16b, v27.16b, v3.16b  // FFMIN(d[0..7], clip[0..7])
        cmtst           v2.2d, v5.2d, v2.2d
        cmhs            v3.8h, v0.8h, v23.8h
        mov             w4, v5.s[1]
        mov             w5, v5.s[3]
        and             w0, w0, w2
        bic             v5.16b, v19.16b, v16.16b  // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        orr             v2.16b, v7.16b, v2.16b
        bsl             v3.16b, v23.16b, v0.16b  // FFMIN(d[8..15], clip[8..15])
        mls             v25.8h, v5.8h, v4.8h  // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
        and             w2, w4, w5
        bic             v0.16b, v3.16b, v2.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v24.8h, v5.8h, v4.8h  // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
        and             w0, w0, w2
        mls             v18.8h, v0.8h, v6.8h  // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
        sqxtun          v2.8b, v25.8h
        tbnz            w0, #0, 1f  // none of the 16 pixel pairs should be updated in this case
        mla             v1.8h, v0.8h, v6.8h  // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
        sqxtun          v0.8b, v24.8h
        sqxtun2         v2.16b, v18.8h
        sqxtun2         v0.16b, v1.8h
        st1             {v2.16b}, [x3], x1
        st1             {v0.16b}, [x3]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
// x0 -> top-left pel of right block
// x1 = row stride, bytes
// w2 = PQUANT bitstream parameter
function ff_vc1_h_loop_filter16_neon, export=1
        sub             x3, x0, #4  // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1  // P1[0], P2[0]...
        sub             x0, x0, #1  // where to start writing
        ld1             {v2.8b}, [x3], x1
        add             x4, x0, x1, lsl #3
        ld1             {v3.8b}, [x3], x1
        add             x5, x0, x1, lsl #2
        ld1             {v4.8b}, [x3], x1
        add             x6, x4, x1, lsl #2
        ld1             {v5.8b}, [x3], x1
        ld1             {v6.8b}, [x3], x1
        ld1             {v7.8b}, [x3], x1
        trn1            v16.8b, v1.8b, v2.8b  // P1[0], P1[1], P3[0]...
        ld1             {v17.8b}, [x3], x1
        trn2            v1.8b, v1.8b, v2.8b  // P2[0], P2[1], P4[0]...
        ld1             {v2.8b}, [x3], x1
        trn1            v18.8b, v3.8b, v4.8b  // P1[2], P1[3], P3[2]...
        ld1             {v19.8b}, [x3], x1
        trn2            v3.8b, v3.8b, v4.8b  // P2[2], P2[3], P4[2]...
        ld1             {v4.8b}, [x3], x1
        trn1            v20.8b, v5.8b, v6.8b  // P1[4], P1[5], P3[4]...
        ld1             {v21.8b}, [x3], x1
        trn2            v5.8b, v5.8b, v6.8b  // P2[4], P2[5], P4[4]...
        ld1             {v6.8b}, [x3], x1
        trn1            v22.8b, v7.8b, v17.8b  // P1[6], P1[7], P3[6]...
        ld1             {v23.8b}, [x3], x1
        trn2            v7.8b, v7.8b, v17.8b  // P2[6], P2[7], P4[6]...
        ld1             {v17.8b}, [x3], x1
        trn1            v24.8b, v2.8b, v19.8b  // P1[8], P1[9], P3[8]...
        ld1             {v25.8b}, [x3]
        trn2            v2.8b, v2.8b, v19.8b  // P2[8], P2[9], P4[8]...
        trn1            v19.4h, v16.4h, v18.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
        trn1            v26.8b, v4.8b, v21.8b  // P1[10], P1[11], P3[10]...
        trn2            v4.8b, v4.8b, v21.8b  // P2[10], P2[11], P4[10]...
        trn1            v21.4h, v1.4h, v3.4h  // P2[0], P2[1], P2[2], P2[3], P6[0]...
        trn1            v27.4h, v20.4h, v22.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
        trn1            v28.8b, v6.8b, v23.8b  // P1[12], P1[13], P3[12]...
        trn2            v6.8b, v6.8b, v23.8b  // P2[12], P2[13], P4[12]...
        trn1            v23.4h, v5.4h, v7.4h  // P2[4], P2[5], P2[6], P2[7], P6[4]...
        trn1            v29.4h, v24.4h, v26.4h  // P1[8], P1[9], P1[10], P1[11], P5[8]...
        trn1            v30.8b, v17.8b, v25.8b  // P1[14], P1[15], P3[14]...
        trn2            v17.8b, v17.8b, v25.8b  // P2[14], P2[15], P4[14]...
        trn1            v25.4h, v2.4h, v4.4h  // P2[8], P2[9], P2[10], P2[11], P6[8]...
        trn1            v31.2s, v19.2s, v27.2s  // P1[0..7]
        trn2            v19.2s, v19.2s, v27.2s  // P5[0..7]
        trn1            v27.2s, v21.2s, v23.2s  // P2[0..7]
        trn2            v21.2s, v21.2s, v23.2s  // P6[0..7]
        trn1            v23.4h, v28.4h, v30.4h  // P1[12], P1[13], P1[14], P1[15], P5[12]...
        trn2            v16.4h, v16.4h, v18.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
        trn1            v18.4h, v6.4h, v17.4h  // P2[12], P2[13], P2[14], P2[15], P6[12]...
        trn2            v20.4h, v20.4h, v22.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
        trn2            v22.4h, v24.4h, v26.4h  // P3[8], P3[9], P3[10], P3[11], P7[8]...
        trn1            v24.2s, v29.2s, v23.2s  // P1[8..15]
        trn2            v23.2s, v29.2s, v23.2s  // P5[8..15]
        trn1            v26.2s, v25.2s, v18.2s  // P2[8..15]
        trn2            v18.2s, v25.2s, v18.2s  // P6[8..15]
        trn2            v25.4h, v28.4h, v30.4h  // P3[12], P3[13], P3[14], P3[15], P7[12]...
        trn2            v1.4h, v1.4h, v3.4h  // P4[0], P4[1], P4[2], P4[3], P8[0]...
        trn2            v3.4h, v5.4h, v7.4h  // P4[4], P4[5], P4[6], P4[7], P8[4]...
        trn2            v2.4h, v2.4h, v4.4h  // P4[8], P4[9], P4[10], P4[11], P8[8]...
        trn2            v4.4h, v6.4h, v17.4h  // P4[12], P4[13], P4[14], P4[15], P8[12]...
        ushll           v5.8h, v31.8b, #1  // 2*P1[0..7]
        ushll           v6.8h, v19.8b, #1  // 2*P5[0..7]
        trn1            v7.2s, v16.2s, v20.2s  // P3[0..7]
        uxtl            v17.8h, v27.8b  // P2[0..7]
        trn2            v16.2s, v16.2s, v20.2s  // P7[0..7]
        uxtl            v20.8h, v21.8b  // P6[0..7]
        trn1            v21.2s, v22.2s, v25.2s  // P3[8..15]
        ushll           v24.8h, v24.8b, #1  // 2*P1[8..15]
        trn2            v22.2s, v22.2s, v25.2s  // P7[8..15]
        ushll           v25.8h, v23.8b, #1  // 2*P5[8..15]
        trn1            v27.2s, v1.2s, v3.2s  // P4[0..7]
        uxtl            v26.8h, v26.8b  // P2[8..15]
        mls             v5.8h, v17.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
        uxtl            v17.8h, v18.8b  // P6[8..15]
        mls             v6.8h, v20.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
        trn1            v18.2s, v2.2s, v4.2s  // P4[8..15]
        uxtl            v28.8h, v7.8b  // P3[0..7]
        mls             v24.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]
        uxtl            v16.8h, v16.8b  // P7[0..7]
        uxtl            v26.8h, v21.8b  // P3[8..15]
        mls             v25.8h, v17.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
        uxtl            v22.8h, v22.8b  // P7[8..15]
        ushll           v7.8h, v7.8b, #1  // 2*P3[0..7]
        uxtl            v27.8h, v27.8b  // P4[0..7]
        trn2            v1.2s, v1.2s, v3.2s  // P8[0..7]
        ushll           v3.8h, v21.8b, #1  // 2*P3[8..15]
        trn2            v2.2s, v2.2s, v4.2s  // P8[8..15]
        uxtl            v4.8h, v18.8b  // P4[8..15]
        mla             v5.8h, v28.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        uxtl            v1.8h, v1.8b  // P8[0..7]
        mla             v6.8h, v16.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        uxtl            v2.8h, v2.8b  // P8[8..15]
        uxtl            v16.8h, v19.8b  // P5[0..7]
        mla             v24.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        uxtl            v18.8h, v23.8b  // P5[8..15]
        dup             v19.8h, w2  // pq
        mla             v25.8h, v22.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        sub             v21.8h, v27.8h, v16.8h  // P4[0..7]-P5[0..7]
        sub             v22.8h, v4.8h, v18.8h  // P4[8..15]-P5[8..15]
        mls             v7.8h, v27.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
        abs             v23.8h, v21.8h
        mls             v3.8h, v4.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
        abs             v26.8h, v22.8h
        sshr            v21.8h, v21.8h, #8  // clip_sign[0..7]
        mls             v5.8h, v27.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        sshr            v23.8h, v23.8h, #1  // clip[0..7]
        sshr            v26.8h, v26.8h, #1  // clip[8..15]
        mls             v6.8h, v1.8h, v0.h[0]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        sshr            v1.8h, v22.8h, #8  // clip_sign[8..15]
        cmeq            v22.8h, v23.8h, #0  // test clip[0..7] == 0
        mls             v24.8h, v4.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        cmeq            v28.8h, v26.8h, #0  // test clip[8..15] == 0
        srshr           v5.8h, v5.8h, #3
        mls             v25.8h, v2.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        srshr           v2.8h, v6.8h, #3
        mla             v7.8h, v16.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        srshr           v6.8h, v24.8h, #3
        mla             v3.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        abs             v5.8h, v5.8h  // a1[0..7]
        srshr           v24.8h, v25.8h, #3
        mls             v3.8h, v17.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        abs             v2.8h, v2.8h  // a2[0..7]
        abs             v6.8h, v6.8h  // a1[8..15]
        mls             v7.8h, v20.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        abs             v17.8h, v24.8h  // a2[8..15]
        cmhs            v20.8h, v5.8h, v2.8h  // test a1[0..7] >= a2[0..7]
        srshr           v3.8h, v3.8h, #3
        cmhs            v24.8h, v6.8h, v17.8h  // test a1[8..15] >= a2[8..15]
        srshr           v7.8h, v7.8h, #3
        bsl             v20.16b, v2.16b, v5.16b  // a3[0..7]
        abs             v2.8h, v3.8h  // a0[8..15]
        sshr            v3.8h, v3.8h, #8  // a0_sign[8..15]
        bsl             v24.16b, v17.16b, v6.16b  // a3[8..15]
        abs             v5.8h, v7.8h  // a0[0..7]
        sshr            v6.8h, v7.8h, #8  // a0_sign[0..7]
        cmhs            v7.8h, v2.8h, v19.8h  // test a0[8..15] >= pq
        sub             v1.8h, v1.8h, v3.8h  // clip_sign[8..15] - a0_sign[8..15]
        uqsub           v3.8h, v2.8h, v24.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite and then take the abs)
        cmhs            v2.8h, v24.8h, v2.8h  // test a3[8..15] >= a0[8..15]
        uqsub           v17.8h, v5.8h, v20.8h  // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite and then take the abs)
        cmhs            v19.8h, v5.8h, v19.8h  // test a0[0..7] >= pq
        orr             v7.16b, v28.16b, v7.16b  // test clip[8..15] == 0 || a0[8..15] >= pq
        sub             v6.8h, v21.8h, v6.8h  // clip_sign[0..7] - a0_sign[0..7]
        mul             v3.8h, v3.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        cmhs            v5.8h, v20.8h, v5.8h  // test a3[0..7] >= a0[0..7]
        orr             v19.16b, v22.16b, v19.16b  // test clip[0..7] == 0 || a0[0..7] >= pq
        mul             v0.8h, v17.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        orr             v2.16b, v7.16b, v2.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        orr             v5.16b, v19.16b, v5.16b  // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        ushr            v3.8h, v3.8h, #3  // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        mov             w7, v2.s[1]
        mov             w8, v2.s[3]
        ushr            v0.8h, v0.8h, #3  // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        mov             w2, v5.s[1]             // move to gp reg
        cmhs            v2.8h, v3.8h, v26.8h
        mov             w3, v5.s[3]
        cmhs            v5.8h, v0.8h, v23.8h
        bsl             v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
        and             w9, w7, w8
        bsl             v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
        and             w10, w2, w3
        bic             v0.16b, v2.16b, v7.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        and             w9, w10, w9
        bic             v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        mls             v4.8h, v0.8h, v1.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
        tbnz            w9, #0, 4f              // none of the 16 pixel pairs should be updated in this case
        mls             v27.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
        mla             v16.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
        sqxtun          v2.8b, v4.8h
        mla             v18.8h, v0.8h, v1.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v27.8h
        sqxtun          v1.8b, v16.8h
        sqxtun          v3.8b, v18.8h
        tbnz            w2, #0, 1f
        st2             {v0.b, v1.b}[0], [x0], x1
        st2             {v0.b, v1.b}[1], [x0], x1
        st2             {v0.b, v1.b}[2], [x0], x1
        st2             {v0.b, v1.b}[3], [x0]
1:      tbnz            w3, #0, 2f
        st2             {v0.b, v1.b}[4], [x5], x1
        st2             {v0.b, v1.b}[5], [x5], x1
        st2             {v0.b, v1.b}[6], [x5], x1
        st2             {v0.b, v1.b}[7], [x5]
2:      tbnz            w7, #0, 3f
        st2             {v2.b, v3.b}[0], [x4], x1
        st2             {v2.b, v3.b}[1], [x4], x1
        st2             {v2.b, v3.b}[2], [x4], x1
        st2             {v2.b, v3.b}[3], [x4]
3:      tbnz            w8, #0, 4f
        st2             {v2.b, v3.b}[4], [x6], x1
        st2             {v2.b, v3.b}[5], [x6], x1
        st2             {v2.b, v3.b}[6], [x6], x1
        st2             {v2.b, v3.b}[7], [x6]
4:      ret
endfunc

// Copy at most the specified number of bytes from source to destination buffer,
// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
// On entry:
//   x0 -> source buffer
//   w1 = max number of bytes to copy
//   x2 -> destination buffer, optimally 8-byte aligned
// On exit:
//   w0 = number of bytes not copied
function ff_vc1_unescape_buffer_helper_neon, export=1
        // Offset by 80 to screen out cases that are too short for us to handle,
        // and also make it easy to test for loop termination, or to determine
        // whether we need an odd number of half-iterations of the loop.
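        // Each 32-bit window w of the input is tested for the start of an
        // escape sequence, i.e. the byte pattern 0x00 0x00 0x03 0x00-0x03.
        // Reading the constants below, on little-endian data this amounts to
        // (a sketch of the predicate, not a quote of FFmpeg's C code):
        //   escape(w) = ((w & ~0x03000000) ^ 0x00030000) == 0
        // which is what the bic (v20), eor (v21) and cmeq chains implement;
        // the ext instructions repeat the test at all four byte alignments.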
        subs            w1, w1, #80
        b.mi            90f

        // Set up useful constants
        movi            v20.4s, #3, lsl #24
        movi            v21.4s, #3, lsl #16

        tst             w1, #32
        b.ne            1f

        ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
        ext             v25.16b, v0.16b, v1.16b, #1
        ext             v26.16b, v0.16b, v1.16b, #2
        ext             v27.16b, v0.16b, v1.16b, #3
        ext             v29.16b, v1.16b, v2.16b, #1
        ext             v30.16b, v1.16b, v2.16b, #2
        ext             v31.16b, v1.16b, v2.16b, #3
        bic             v24.16b, v0.16b, v20.16b
        bic             v25.16b, v25.16b, v20.16b
        bic             v26.16b, v26.16b, v20.16b
        bic             v27.16b, v27.16b, v20.16b
        bic             v28.16b, v1.16b, v20.16b
        bic             v29.16b, v29.16b, v20.16b
        bic             v30.16b, v30.16b, v20.16b
        bic             v31.16b, v31.16b, v20.16b
        eor             v24.16b, v24.16b, v21.16b
        eor             v25.16b, v25.16b, v21.16b
        eor             v26.16b, v26.16b, v21.16b
        eor             v27.16b, v27.16b, v21.16b
        eor             v28.16b, v28.16b, v21.16b
        eor             v29.16b, v29.16b, v21.16b
        eor             v30.16b, v30.16b, v21.16b
        eor             v31.16b, v31.16b, v21.16b
        cmeq            v24.4s, v24.4s, #0
        cmeq            v25.4s, v25.4s, #0
        cmeq            v26.4s, v26.4s, #0
        cmeq            v27.4s, v27.4s, #0
        add             w1, w1, #32
        b               3f

1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
        ext             v25.16b, v3.16b, v4.16b, #1
        ext             v26.16b, v3.16b, v4.16b, #2
        ext             v27.16b, v3.16b, v4.16b, #3
        ext             v29.16b, v4.16b, v5.16b, #1
        ext             v30.16b, v4.16b, v5.16b, #2
        ext             v31.16b, v4.16b, v5.16b, #3
        bic             v24.16b, v3.16b, v20.16b
        bic             v25.16b, v25.16b, v20.16b
        bic             v26.16b, v26.16b, v20.16b
        bic             v27.16b, v27.16b, v20.16b
        bic             v28.16b, v4.16b, v20.16b
        bic             v29.16b, v29.16b, v20.16b
        bic             v30.16b, v30.16b, v20.16b
        bic             v31.16b, v31.16b, v20.16b
        eor             v24.16b, v24.16b, v21.16b
        eor             v25.16b, v25.16b, v21.16b
        eor             v26.16b, v26.16b, v21.16b
        eor             v27.16b, v27.16b, v21.16b
        eor             v28.16b, v28.16b, v21.16b
        eor             v29.16b, v29.16b, v21.16b
        eor             v30.16b, v30.16b, v21.16b
        eor             v31.16b, v31.16b, v21.16b
        cmeq            v24.4s, v24.4s, #0
        cmeq            v25.4s, v25.4s, #0
        cmeq            v26.4s, v26.4s, #0
        cmeq            v27.4s, v27.4s, #0
        // Drop through...
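        // The main loop is software-pipelined in 32-byte half-iterations
        // (labels 2 and 3): each half loads the next 32 bytes, completes the
        // escape tests for the block loaded earlier, and stores a block only
        // once it is known to be escape-free. Hitting an escape sequence (or
        // the end of the input) drops out through the exit code with the
        // number of unconsumed bytes.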
2:      mov             v0.16b, v5.16b
        ld1             {v1.16b, v2.16b}, [x0], #32
        cmeq            v28.4s, v28.4s, #0
        cmeq            v29.4s, v29.4s, #0
        cmeq            v30.4s, v30.4s, #0
        cmeq            v31.4s, v31.4s, #0
        orr             v24.16b, v24.16b, v25.16b
        orr             v26.16b, v26.16b, v27.16b
        orr             v28.16b, v28.16b, v29.16b
        orr             v30.16b, v30.16b, v31.16b
        ext             v25.16b, v0.16b, v1.16b, #1
        orr             v22.16b, v24.16b, v26.16b
        ext             v26.16b, v0.16b, v1.16b, #2
        ext             v27.16b, v0.16b, v1.16b, #3
        ext             v29.16b, v1.16b, v2.16b, #1
        orr             v23.16b, v28.16b, v30.16b
        ext             v30.16b, v1.16b, v2.16b, #2
        ext             v31.16b, v1.16b, v2.16b, #3
        bic             v24.16b, v0.16b, v20.16b
        bic             v25.16b, v25.16b, v20.16b
        bic             v26.16b, v26.16b, v20.16b
        orr             v22.16b, v22.16b, v23.16b
        bic             v27.16b, v27.16b, v20.16b
        bic             v28.16b, v1.16b, v20.16b
        bic             v29.16b, v29.16b, v20.16b
        bic             v30.16b, v30.16b, v20.16b
        bic             v31.16b, v31.16b, v20.16b
        addv            s22, v22.4s
        eor             v24.16b, v24.16b, v21.16b
        eor             v25.16b, v25.16b, v21.16b
        eor             v26.16b, v26.16b, v21.16b
        eor             v27.16b, v27.16b, v21.16b
        eor             v28.16b, v28.16b, v21.16b
        mov             w3, v22.s[0]
        eor             v29.16b, v29.16b, v21.16b
        eor             v30.16b, v30.16b, v21.16b
        eor             v31.16b, v31.16b, v21.16b
        cmeq            v24.4s, v24.4s, #0
        cmeq            v25.4s, v25.4s, #0
        cmeq            v26.4s, v26.4s, #0
        cmeq            v27.4s, v27.4s, #0
        cbnz            w3, 90f
        st1             {v3.16b, v4.16b}, [x2], #32
3:      mov             v3.16b, v2.16b
        ld1             {v4.16b, v5.16b}, [x0], #32
        cmeq            v28.4s, v28.4s, #0
        cmeq            v29.4s, v29.4s, #0
        cmeq            v30.4s, v30.4s, #0
        cmeq            v31.4s, v31.4s, #0
        orr             v24.16b, v24.16b, v25.16b
        orr             v26.16b, v26.16b, v27.16b
        orr             v28.16b, v28.16b, v29.16b
        orr             v30.16b, v30.16b, v31.16b
        ext             v25.16b, v3.16b, v4.16b, #1
        orr             v22.16b, v24.16b, v26.16b
        ext             v26.16b, v3.16b, v4.16b, #2
        ext             v27.16b, v3.16b, v4.16b, #3
        ext             v29.16b, v4.16b, v5.16b, #1
        orr             v23.16b, v28.16b, v30.16b
        ext             v30.16b, v4.16b, v5.16b, #2
        ext             v31.16b, v4.16b, v5.16b, #3
        bic             v24.16b, v3.16b, v20.16b
        bic             v25.16b, v25.16b, v20.16b
        bic             v26.16b, v26.16b, v20.16b
        orr             v22.16b, v22.16b, v23.16b
        bic             v27.16b, v27.16b, v20.16b
        bic             v28.16b, v4.16b, v20.16b
        bic             v29.16b, v29.16b, v20.16b
        bic             v30.16b, v30.16b, v20.16b
        bic             v31.16b, v31.16b, v20.16b
        addv            s22, v22.4s
        eor             v24.16b, v24.16b, v21.16b
        eor             v25.16b, v25.16b, v21.16b
        eor             v26.16b, v26.16b, v21.16b
        eor             v27.16b, v27.16b, v21.16b
        eor             v28.16b, v28.16b, v21.16b
        mov             w3, v22.s[0]
        eor             v29.16b, v29.16b, v21.16b
        eor             v30.16b, v30.16b, v21.16b
        eor             v31.16b, v31.16b, v21.16b
        cmeq            v24.4s, v24.4s, #0
        cmeq            v25.4s, v25.4s, #0
        cmeq            v26.4s, v26.4s, #0
        cmeq            v27.4s, v27.4s, #0
        cbnz            w3, 91f
        st1             {v0.16b, v1.16b}, [x2], #32
        subs            w1, w1, #64
        b.pl            2b

90:     add             w0, w1, #80
        ret

91:     sub             w1, w1, #32
        b               90b
endfunc
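// Note: the helper never removes escape bytes itself; it only copies leading
// 32-byte blocks that are verifiably free of escape sequences and reports how
// much input remains. A hypothetical C-side caller (illustrative sketch only,
// not FFmpeg's actual wrapper; scalar_unescape is a made-up name for a scalar
// fallback) might use it like this:
//
//   int unescape(const uint8_t *src, int size, uint8_t *dst)
//   {
//       int remaining = ff_vc1_unescape_buffer_helper_neon(src, size, dst);
//       int copied = size - remaining;   // bytes already escape-free
//       // Handle the tail, including any escape sequence detected above,
//       // with scalar code (a real wrapper would loop back to the helper
//       // after stripping each 0x03):
//       return copied + scalar_unescape(src + copied, remaining, dst + copied);
//   }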