/*
 * VC1 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

#include "config.h"

@ Transpose rows into columns of a matrix of 16-bit elements. For 4x4, pass
@ double-word registers, for 8x4, pass quad-word registers.
.macro transpose16 r0, r1, r2, r3
        @ At this point:
        @   row[0]  r0
        @   row[1]  r1
        @   row[2]  r2
        @   row[3]  r3

        vtrn.16 \r0, \r1 @ first and second row
        vtrn.16 \r2, \r3 @ third and fourth row
        vtrn.32 \r0, \r2 @ first and third row
        vtrn.32 \r1, \r3 @ second and fourth row

        @ At this point, if registers are quad-word:
        @   column[0]   d0
        @   column[1]   d2
        @   column[2]   d4
        @   column[3]   d6
        @   column[4]   d1
        @   column[5]   d3
        @   column[6]   d5
        @   column[7]   d7

        @ At this point, if registers are double-word:
        @   column[0]   d0
        @   column[1]   d1
        @   column[2]   d2
        @   column[3]   d3
.endm

@ ff_vc1_inv_trans_{4,8}x{4,8}_neon and overflow: The input values in the file
@ are supposed to be in a specific range so as to allow for 16-bit math without
@ causing overflows, but sometimes the input values are just big enough to
@ barely cause overflow in vadd instructions like:
@
@ vadd.i16  q0, q8, q10
@ vshr.s16  q0, q0, #\rshift
@
@ To prevent these borderline cases from overflowing, we just need one more
@ bit of precision, which is accomplished by replacing the sequence above with:
@
@ vhadd.s16 q0, q8, q10
@ vshr.s16  q0, q0, #(\rshift - 1)
@
@ This works because vhadd is a single instruction that adds, then shifts to
@ the right once, all before writing the result to the destination register.
@
@ Even with this workaround, there were still some files that caused overflows
@ in ff_vc1_inv_trans_8x8_neon. See the comments in ff_vc1_inv_trans_8x8_neon
@ for the additional workaround.

@ Takes 4 columns of 8 values each and operates on it. Modeled after the first
@ for loop in vc1_inv_trans_4x8_c.
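@ For reference, the per-column arithmetic in that C loop is roughly the
@ following sketch (add/rshift correspond to the parameters of the macro
@ below; this is an illustration derived from the comments in the macro, not
@ a verbatim copy of the C source):
@
@     t1 = 17 * (src[0] + src[2]) + add;
@     t2 = 17 * (src[0] - src[2]) + add;
@     t3 = 22 * src[1] + 10 * src[3];
@     t4 = 22 * src[3] - 10 * src[1];
@     dst[0] = (t1 + t3) >> rshift;
@     dst[1] = (t2 - t4) >> rshift;
@     dst[2] = (t2 + t4) >> rshift;
@     dst[3] = (t1 - t3) >> rshift;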
@ Input columns:  q0 q1 q2 q3
@ Output columns: q0 q1 q2 q3
@ Trashes: r12 q8 q9 q10 q11 q12 q13
.macro vc1_inv_trans_4x8_helper add rshift
        @ Compute temp1, temp2 and setup scalar #17, #22, #10
        vadd.i16 q12, q0, q2 @ temp1 = src[0] + src[2]
        movw r12, #17
        vsub.i16 q13, q0, q2 @ temp2 = src[0] - src[2]
        movt r12, #22
        vmov.32 d0[0], r12
        movw r12, #10
        vmov.16 d1[0], r12

        vmov.i16 q8, #\add @ t1 will accumulate here
        vmov.i16 q9, #\add @ t2 will accumulate here

        vmul.i16 q10, q1, d0[1] @ t3 = 22 * (src[1])
        vmul.i16 q11, q3, d0[1] @ t4 = 22 * (src[3])

        vmla.i16 q8, q12, d0[0] @ t1 = 17 * (temp1) + 4
        vmla.i16 q9, q13, d0[0] @ t2 = 17 * (temp2) + 4

        vmla.i16 q10, q3, d1[0] @ t3 += 10 * src[3]
        vmls.i16 q11, q1, d1[0] @ t4 -= 10 * src[1]

        vhadd.s16 q0, q8, q10 @ dst[0] = (t1 + t3) >> 1
        vhsub.s16 q3, q8, q10 @ dst[3] = (t1 - t3) >> 1
        vhsub.s16 q1, q9, q11 @ dst[1] = (t2 - t4) >> 1
        vhadd.s16 q2, q9, q11 @ dst[2] = (t2 + t4) >> 1

        @ Halving add/sub above already did one shift
        vshr.s16 q0, q0, #(\rshift - 1) @ dst[0] >>= (rshift - 1)
        vshr.s16 q3, q3, #(\rshift - 1) @ dst[3] >>= (rshift - 1)
        vshr.s16 q1, q1, #(\rshift - 1) @ dst[1] >>= (rshift - 1)
        vshr.s16 q2, q2, #(\rshift - 1) @ dst[2] >>= (rshift - 1)
.endm

@ Takes 8 columns of 4 values each and operates on it. Modeled after the second
@ for loop in vc1_inv_trans_4x8_c.
@ Input columns:  d0 d2 d4 d6 d1 d3 d5 d7
@ Output columns: d16 d17 d18 d19 d21 d20 d23 d22
@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
.macro vc1_inv_trans_8x4_helper add add1beforeshift rshift
        @ At this point:
        @   src[0]  d0  overwritten later
        @   src[8]  d2
        @   src[16] d4  overwritten later
        @   src[24] d6
        @   src[32] d1  overwritten later
        @   src[40] d3
        @   src[48] d5  overwritten later
        @   src[56] d7

        movw r12, #12
        vmov.i16 q14, #\add @ t1|t2 will accumulate here
        movt r12, #6

        vadd.i16 d20, d0, d1 @ temp1 = src[0] + src[32]
        vsub.i16 d21, d0, d1 @ temp2 = src[0] - src[32]
        vmov.i32 d0[0], r12 @ 16-bit: d0[0] = #12, d0[1] = #6

        vshl.i16 q15, q2, #4 @ t3|t4 = 16 * (src[16]|src[48])
        vswp d4, d5 @ q2 = src[48]|src[16]
        vmla.i16 q14, q10, d0[0] @ t1|t2 = 12 * (temp1|temp2) + 64
        movw r12, #15
        movt r12, #9
        vmov.i32 d0[1], r12 @ 16-bit: d0[2] = #15, d0[3] = #9
        vneg.s16 d31, d31 @ t4 = -t4
        vmla.i16 q15, q2, d0[1] @ t3|t4 += 6 * (src[48]|src[16])

        @ At this point:
        @   d0[2]   #15
        @   d0[3]   #9
        @   q1      src[8]|src[40]
        @   q3      src[24]|src[56]
        @   q14     old t1|t2
        @   q15     old t3|t4

        vshl.i16 q8, q1, #4 @ t1|t2 = 16 * (src[8]|src[40])
        vswp d2, d3 @ q1 = src[40]|src[8]
        vshl.i16 q12, q3, #4 @ temp3a|temp4a = 16 * src[24]|src[56]
        vswp d6, d7 @ q3 = src[56]|src[24]
        vshl.i16 q13, q1, #2 @ temp3b|temp4b = 4 * (src[40]|src[8])
        vshl.i16 q2, q3, #2 @ temp1|temp2 = 4 * (src[56]|src[24])
        vswp d3, d6 @ q1 = src[40]|src[56], q3 = src[8]|src[24]
        vsub.i16 q9, q13, q12 @ t3|t4 = - (temp3a|temp4a) + (temp3b|temp4b)
        vadd.i16 q8, q8, q2 @ t1|t2 += temp1|temp2
        vmul.i16 q12, q3, d0[3] @ temp3|temp4 = 9 * src[8]|src[24]
        vmla.i16 q8, q1, d0[3] @ t1|t2 += 9 * (src[40]|src[56])
        vswp d6, d7 @ q3 = src[24]|src[8]
        vswp d2, d3 @ q1 = src[56]|src[40]

        vsub.i16 q11, q14, q15 @ t8|t7 = old t1|t2 - old t3|t4
        vadd.i16 q10, q14, q15 @ t5|t6 = old t1|t2 + old t3|t4
        .if \add1beforeshift
        vmov.i16 q15, #1
        .endif

        vadd.i16 d18, d18, d24 @ t3 += temp3
        vsub.i16 d19, d19, d25 @ t4 -= temp4

        vswp d22, d23 @ q11 = t7|t8

        vneg.s16 d17, d17 @ t2 = -t2
        vmla.i16 q9, q1, d0[2] @ t3|t4 += 15 * src[56]|src[40]
        vmla.i16 q8, q3, d0[2] @ t1|t2 += 15 * src[24]|src[8]

        @ At this point:
        @   t1  d16
        @   t2  d17
        @   t3  d18
        @   t4  d19
        @   t5  d20
        @   t6  d21
        @   t7  d22
        @   t8  d23
        @   #1  q15

        .if \add1beforeshift
        vadd.i16 q3, q15, q10 @ line[7,6] = t5|t6 + 1
        vadd.i16 q2, q15, q11 @ line[5,4] = t7|t8 + 1
        .endif

        @ Sometimes this overflows, so to get one additional bit of precision, use
        @ a single instruction that both adds and shifts right (halving).
        vhadd.s16 q1, q9, q11 @ line[2,3] = (t3|t4 + t7|t8) >> 1
        vhadd.s16 q0, q8, q10 @ line[0,1] = (t1|t2 + t5|t6) >> 1
        .if \add1beforeshift
        vhsub.s16 q2, q2, q9 @ line[5,4] = (t7|t8 - t3|t4 + 1) >> 1
        vhsub.s16 q3, q3, q8 @ line[7,6] = (t5|t6 - t1|t2 + 1) >> 1
        .else
        vhsub.s16 q2, q11, q9 @ line[5,4] = (t7|t8 - t3|t4) >> 1
        vhsub.s16 q3, q10, q8 @ line[7,6] = (t5|t6 - t1|t2) >> 1
        .endif

        vshr.s16 q9, q1, #(\rshift - 1) @ one shift is already done by vhadd/vhsub above
        vshr.s16 q8, q0, #(\rshift - 1)
        vshr.s16 q10, q2, #(\rshift - 1)
        vshr.s16 q11, q3, #(\rshift - 1)

        @ At this point:
        @   dst[0]  d16
        @   dst[1]  d17
        @   dst[2]  d18
        @   dst[3]  d19
        @   dst[4]  d21
        @   dst[5]  d20
        @   dst[6]  d23
        @   dst[7]  d22
.endm

@ This is modeled after the first and second for loop in vc1_inv_trans_8x8_c.
@ Input columns:  q8, q9, q10, q11, q12, q13, q14, q15
@ Output columns: q8, q9, q10, q11, q12, q13, q14, q15
@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
.macro vc1_inv_trans_8x8_helper add add1beforeshift rshift
        @ This actually computes half of t1, t2, t3, t4, as explained below
        @ near `tNhalf`.
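        @ As a rough sketch, the C arithmetic being mirrored in this first
        @ stage (even-indexed rows of the 8-point transform) looks like the
        @ following; this is an illustration only, with the constants taken
        @ from the instruction comments below:
        @
        @     t1 = 12 * (src[0] + src[32]) + add;
        @     t2 = 12 * (src[0] - src[32]) + add;
        @     t3 = 16 * src[16] + 6 * src[48];
        @     t4 =  6 * src[16] - 16 * src[48];
        @
        @ The NEON version keeps t1..t4 at half scale (hence the "/ 2" in the
        @ immediates) so that the later sums stay within 16 bits.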
        vmov.i16 q0, #(6 / 2) @ q0 = #6/2
        vshl.i16 q1, q10, #3 @ t3 = 16/2 * src[16]
        vshl.i16 q3, q14, #3 @ temp4 = 16/2 * src[48]
        vmul.i16 q2, q10, q0 @ t4 = 6/2 * src[16]
        vmla.i16 q1, q14, q0 @ t3 += 6/2 * src[48]
        @ unused: q0, q10, q14
        vmov.i16 q0, #(12 / 2) @ q0 = #12/2
        vadd.i16 q10, q8, q12 @ temp1 = src[0] + src[32]
        vsub.i16 q14, q8, q12 @ temp2 = src[0] - src[32]
        @ unused: q8, q12
        vmov.i16 q8, #(\add / 2) @ t1 will accumulate here
        vmov.i16 q12, #(\add / 2) @ t2 will accumulate here
        movw r12, #15
        vsub.i16 q2, q2, q3 @ t4 = 6/2 * src[16] - 16/2 * src[48]
        movt r12, #9
        @ unused: q3
        vmla.i16 q8, q10, q0 @ t1 = 12/2 * temp1 + add
        vmla.i16 q12, q14, q0 @ t2 = 12/2 * temp2 + add
        vmov.i32 d0[0], r12
        @ unused: q3, q10, q14

        @ At this point:
        @   q0  d0=#15|#9
        @   q1  old t3
        @   q2  old t4
        @   q3
        @   q8  old t1
        @   q9  src[8]
        @   q10
        @   q11 src[24]
        @   q12 old t2
        @   q13 src[40]
        @   q14
        @   q15 src[56]

        @ unused: q3, q10, q14
        movw r12, #16
        vshl.i16 q3, q9, #4 @ t1 = 16 * src[8]
        movt r12, #4
        vshl.i16 q10, q9, #2 @ t4 = 4 * src[8]
        vmov.i32 d1[0], r12
        vmul.i16 q14, q9, d0[0] @ t2 = 15 * src[8]
        vmul.i16 q9, q9, d0[1] @ t3 = 9 * src[8]
        @ unused: none
        vmla.i16 q3, q11, d0[0] @ t1 += 15 * src[24]
        vmls.i16 q10, q11, d0[1] @ t4 -= 9 * src[24]
        vmls.i16 q14, q11, d1[1] @ t2 -= 4 * src[24]
        vmls.i16 q9, q11, d1[0] @ t3 -= 16 * src[24]
        @ unused: q11
        vmla.i16 q3, q13, d0[1] @ t1 += 9 * src[40]
        vmla.i16 q10, q13, d0[0] @ t4 += 15 * src[40]
        vmls.i16 q14, q13, d1[0] @ t2 -= 16 * src[40]
        vmla.i16 q9, q13, d1[1] @ t3 += 4 * src[40]
        @ unused: q11, q13

        @ Compute t5, t6, t7, t8 from old t1, t2, t3, t4. Actually, it computes
        @ half of t5, t6, t7, t8 since t1, t2, t3, t4 are halved.
        vadd.i16 q11, q8, q1 @ t5 = t1 + t3
        vsub.i16 q1, q8, q1 @ t8 = t1 - t3
        vadd.i16 q13, q12, q2 @ t6 = t2 + t4
        vsub.i16 q2, q12, q2 @ t7 = t2 - t4
        @ unused: q8, q12

        .if \add1beforeshift
        vmov.i16 q12, #1
        .endif

        @ unused: q8
        vmla.i16 q3, q15, d1[1] @ t1 += 4 * src[56]
        vmls.i16 q14, q15, d0[1] @ t2 -= 9 * src[56]
        vmla.i16 q9, q15, d0[0] @ t3 += 15 * src[56]
        vmls.i16 q10, q15, d1[0] @ t4 -= 16 * src[56]
        @ unused: q0, q8, q15

        @ At this point:
        @   t1      q3
        @   t2      q14
        @   t3      q9
        @   t4      q10
        @   t5half  q11
        @   t6half  q13
        @   t7half  q2
        @   t8half  q1
        @   #1      q12
        @
        @ tNhalf is half of the value of tN (as described in vc1_inv_trans_8x8_c).
        @ This is done because sometimes files have input that causes tN + tM to
        @ overflow. To avoid this overflow, we compute tNhalf, then compute
        @ tNhalf + tM (which doesn't overflow), and then we use vhadd to compute
        @ (tNhalf + (tNhalf + tM)) >> 1 which does not overflow because it is
        @ one instruction.
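        @ In other words, with tN == 2 * tNhalf, the intended value
        @     line = (tN + tM) >> rshift
        @ is evaluated as
        @     line = ((tNhalf + (tNhalf + tM)) >> 1) >> (rshift - 1)
        @ where the inner ">> 1" is the halving built into vhadd/vhsub.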

        @ For each pair of tN and tM, do:
        @   lineA = t5half + t1
        @   if add1beforeshift: t1 -= 1
        @   lineA = (t5half + lineA) >> 1
        @   lineB = t5half - t1
        @   lineB = (t5half + lineB) >> 1
        @   lineA >>= rshift - 1
        @   lineB >>= rshift - 1

        vadd.i16 q8, q11, q3 @ q8 = t5half + t1
        .if \add1beforeshift
        vsub.i16 q3, q3, q12 @ q3 = t1 - 1
        .endif

        vadd.i16 q0, q13, q14 @ q0 = t6half + t2
        .if \add1beforeshift
        vsub.i16 q14, q14, q12 @ q14 = t2 - 1
        .endif

        vadd.i16 q15, q2, q9 @ q15 = t7half + t3
        .if \add1beforeshift
        vsub.i16 q9, q9, q12 @ q9 = t3 - 1
        .endif
        @ unused: none

        vhadd.s16 q8, q11, q8 @ q8 = (t5half + t5half + t1) >> 1
        vsub.i16 q3, q11, q3 @ q3 = t5half - t1 + 1

        vhadd.s16 q0, q13, q0 @ q0 = (t6half + t6half + t2) >> 1
        vsub.i16 q14, q13, q14 @ q14 = t6half - t2 + 1

        vhadd.s16 q15, q2, q15 @ q15 = (t7half + t7half + t3) >> 1
        vsub.i16 q9, q2, q9 @ q9 = t7half - t3 + 1

        vhadd.s16 q3, q11, q3 @ q3 = (t5half + t5half - t1 + 1) >> 1
        @ unused: q11

        vadd.i16 q11, q1, q10 @ q11 = t8half + t4
        .if \add1beforeshift
        vsub.i16 q10, q10, q12 @ q10 = t4 - 1
        .endif
        @ unused: q12

        vhadd.s16 q14, q13, q14 @ q14 = (t6half + t6half - t2 + 1) >> 1
        @ unused: q12, q13
        vhadd.s16 q13, q2, q9 @ q9 = (t7half + t7half - t3 + 1) >> 1
        @ unused: q12, q2, q9

        vsub.i16 q10, q1, q10 @ q10 = t8half - t4 + 1
        vhadd.s16 q11, q1, q11 @ q11 = (t8half + t8half + t4) >> 1

        vshr.s16 q8, q8, #(\rshift - 1) @ q8 = line[0]
        vhadd.s16 q12, q1, q10 @ q12 = (t8half + t8half - t4 + 1) >> 1
        vshr.s16 q9, q0, #(\rshift - 1) @ q9 = line[1]
        vshr.s16 q10, q15, #(\rshift - 1) @ q10 = line[2]
        vshr.s16 q11, q11, #(\rshift - 1) @ q11 = line[3]
        vshr.s16 q12, q12, #(\rshift - 1) @ q12 = line[4]
        vshr.s16 q13, q13, #(\rshift - 1) @ q13 = line[5]
        vshr.s16 q14, q14, #(\rshift - 1) @ q14 = line[6]
        vshr.s16 q15, q3, #(\rshift - 1) @ q15 = line[7]
.endm

@ (int16_t *block [r0])
function ff_vc1_inv_trans_8x8_neon, export=1
        vld1.64 {q8-q9}, [r0,:128]!
        vld1.64 {q10-q11}, [r0,:128]!
        vld1.64 {q12-q13}, [r0,:128]!
        vld1.64 {q14-q15}, [r0,:128]
        sub r0, r0, #(16 * 2 * 3) @ restore r0

        @ At this point:
        @   src[0]  q8
        @   src[8]  q9
        @   src[16] q10
        @   src[24] q11
        @   src[32] q12
        @   src[40] q13
        @   src[48] q14
        @   src[56] q15

        vc1_inv_trans_8x8_helper add=4, add1beforeshift=0, rshift=3

        @ Transpose result matrix of 8x8
        swap4 d17, d19, d21, d23, d24, d26, d28, d30
        transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15

        vc1_inv_trans_8x8_helper add=64, add1beforeshift=1, rshift=7

        vst1.64 {q8-q9}, [r0,:128]!
        vst1.64 {q10-q11}, [r0,:128]!
        vst1.64 {q12-q13}, [r0,:128]!
        vst1.64 {q14-q15}, [r0,:128]

        bx lr
endfunc

@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
function ff_vc1_inv_trans_8x4_neon, export=1
        vld1.64 {q0-q1}, [r2,:128]! @ load 8 * 4 * 2 = 64 bytes / 16 bytes per quad = 4 quad registers
        vld1.64 {q2-q3}, [r2,:128]

        transpose16 q0, q1, q2, q3 @ transpose rows to columns

        @ At this point:
        @   src[0]  d0
        @   src[1]  d2
        @   src[2]  d4
        @   src[3]  d6
        @   src[4]  d1
        @   src[5]  d3
        @   src[6]  d5
        @   src[7]  d7

        vc1_inv_trans_8x4_helper add=4, add1beforeshift=0, rshift=3

        @ Move output to more standardized registers
        vmov d0, d16
        vmov d2, d17
        vmov d4, d18
        vmov d6, d19
        vmov d1, d21
        vmov d3, d20
        vmov d5, d23
        vmov d7, d22

        @ At this point:
        @   dst[0]  d0
        @   dst[1]  d2
        @   dst[2]  d4
        @   dst[3]  d6
        @   dst[4]  d1
        @   dst[5]  d3
        @   dst[6]  d5
        @   dst[7]  d7

        transpose16 q0, q1, q2, q3 @ turn columns into rows

        @ At this point:
        @   row[0]  q0
        @   row[1]  q1
        @   row[2]  q2
        @   row[3]  q3

        vc1_inv_trans_4x8_helper add=64, rshift=7

        @ At this point:
        @   line[0].l  d0
        @   line[0].h  d1
        @   line[1].l  d2
        @   line[1].h  d3
        @   line[2].l  d4
        @   line[2].h  d5
        @   line[3].l  d6
        @   line[3].h  d7

        @ unused registers: q12, q13, q14, q15

        vld1.64 {d28}, [r0,:64], r1 @ read dest
        vld1.64 {d29}, [r0,:64], r1
        vld1.64 {d30}, [r0,:64], r1
        vld1.64 {d31}, [r0,:64], r1
        sub r0, r0, r1, lsl #2 @ restore original r0 value

        vaddw.u8 q0, q0, d28 @ line[0] += dest[0]
        vaddw.u8 q1, q1, d29 @ line[1] += dest[1]
        vaddw.u8 q2, q2, d30 @ line[2] += dest[2]
        vaddw.u8 q3, q3, d31 @ line[3] += dest[3]

        vqmovun.s16 d0, q0 @ line[0]
        vqmovun.s16 d1, q1 @ line[1]
        vqmovun.s16 d2, q2 @ line[2]
        vqmovun.s16 d3, q3 @ line[3]

        vst1.64 {d0}, [r0,:64], r1 @ write dest
        vst1.64 {d1}, [r0,:64], r1
        vst1.64 {d2}, [r0,:64], r1
        vst1.64 {d3}, [r0,:64]

        bx lr
endfunc

@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
function ff_vc1_inv_trans_4x8_neon, export=1
        mov r12, #(8 * 2) @ 8 elements per line, each element 2 bytes
        vld4.16 {d0[], d2[], d4[], d6[]}, [r2,:64], r12 @ read each column into a q register
        vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r2,:64], r12
        vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r2,:64], r12
        vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r2,:64], r12
        vld4.16 {d1[], d3[], d5[], d7[]}, [r2,:64], r12
        vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r2,:64], r12
        vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r2,:64], r12
        vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r2,:64]

        vc1_inv_trans_4x8_helper add=4, rshift=3

        @ At this point:
        @   dst[0] = q0
        @   dst[1] = q1
        @   dst[2] = q2
        @   dst[3] = q3

        transpose16 q0, q1, q2, q3 @ Transpose rows (registers) into columns

        vc1_inv_trans_8x4_helper add=64, add1beforeshift=1, rshift=7

        vld1.32 {d28[]},  [r0,:32], r1 @ read dest
        vld1.32 {d28[1]}, [r0,:32], r1
        vld1.32 {d29[]},  [r0,:32], r1
        vld1.32 {d29[1]}, [r0,:32], r1

        vld1.32 {d30[]},  [r0,:32], r1
        vld1.32 {d30[0]}, [r0,:32], r1
        vld1.32 {d31[]},  [r0,:32], r1
        vld1.32 {d31[0]}, [r0,:32], r1
        sub r0, r0, r1, lsl #3 @ restore original r0 value

        vaddw.u8 q8, q8, d28 @ line[0,1] += dest[0,1]
        vaddw.u8 q9, q9, d29 @ line[2,3] += dest[2,3]
        vaddw.u8 q10, q10, d30 @ line[5,4] += dest[5,4]
        vaddw.u8 q11, q11, d31 @ line[7,6] += dest[7,6]

        vqmovun.s16 d16, q8 @ clip(line[0,1])
        vqmovun.s16 d18, q9 @ clip(line[2,3])
        vqmovun.s16 d20, q10 @ clip(line[5,4])
        vqmovun.s16 d22, q11 @ clip(line[7,6])

        vst1.32 {d16[0]}, [r0,:32], r1 @ write dest
        vst1.32 {d16[1]}, [r0,:32], r1
        vst1.32 {d18[0]}, [r0,:32], r1
        vst1.32 {d18[1]}, [r0,:32], r1

        vst1.32 {d20[1]}, [r0,:32], r1
        vst1.32 {d20[0]}, [r0,:32], r1
        vst1.32 {d22[1]}, [r0,:32], r1
        vst1.32 {d22[0]}, [r0,:32]

        bx lr
endfunc

@ Setup constants in registers which are used by vc1_inv_trans_4x4_helper
.macro vc1_inv_trans_4x4_helper_setup
        vmov.i16 q13, #17
        vmov.i16 q14, #22
        vmov.i16 d30, #10 @ only need double-word, not quad-word
.endm

@ This is modeled after the first for loop in vc1_inv_trans_4x4_c.
.macro vc1_inv_trans_4x4_helper add rshift
        vmov.i16 q2, #\add @ t1|t2 will accumulate here

        vadd.i16 d16, d0, d1 @ temp1 = src[0] + src[2]
        vsub.i16 d17, d0, d1 @ temp2 = src[0] - src[2]
        vmul.i16 q3, q14, q1 @ t3|t4 = 22 * (src[1]|src[3])
        vmla.i16 q2, q13, q8 @ t1|t2 = 17 * (temp1|temp2) + add
        vmla.i16 d6, d30, d3 @ t3 += 10 * src[3]
        vmls.i16 d7, d30, d2 @ t4 -= 10 * src[1]

        vadd.i16 q0, q2, q3 @ dst[0,2] = (t1|t2 + t3|t4)
        vsub.i16 q1, q2, q3 @ dst[3,1] = (t1|t2 - t3|t4)
        vshr.s16 q0, q0, #\rshift @ dst[0,2] >>= rshift
        vshr.s16 q1, q1, #\rshift @ dst[3,1] >>= rshift
.endm

@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
function ff_vc1_inv_trans_4x4_neon, export=1
        mov r12, #(8 * 2) @ 8 elements per line, each element 2 bytes
        vld4.16 {d0[], d1[], d2[], d3[]}, [r2,:64], r12 @ read each column into a register
        vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r12
        vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r12
        vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64]

        vswp d1, d2 @ so that we can later access column 1 and column 3 as a single q1 register

        vc1_inv_trans_4x4_helper_setup

        @ At this point:
        @   src[0] = d0
        @   src[1] = d2
        @   src[2] = d1
        @   src[3] = d3

        vc1_inv_trans_4x4_helper add=4, rshift=3 @ compute t1, t2, t3, t4 and combine them into dst[0-3]

        @ At this point:
        @   dst[0] = d0
        @   dst[1] = d3
        @   dst[2] = d1
        @   dst[3] = d2

        transpose16 d0, d3, d1, d2 @ Transpose rows (registers) into columns

        @ At this point:
        @   src[0]  = d0
        @   src[8]  = d3
        @   src[16] = d1
        @   src[24] = d2

        vswp d2, d3 @ so that we can later access column 1 and column 3 in order as a single q1 register

        @ At this point:
        @   src[0]  = d0
        @   src[8]  = d2
        @   src[16] = d1
        @   src[24] = d3

        vc1_inv_trans_4x4_helper add=64, rshift=7 @ compute t1, t2, t3, t4 and combine them into dst[0-3]

        @ At this point:
        @   line[0] = d0
        @   line[1] = d3
        @   line[2] = d1
        @   line[3] = d2

        vld1.32 {d18[]},  [r0,:32], r1 @ read dest
        vld1.32 {d19[]},  [r0,:32], r1
        vld1.32 {d18[1]}, [r0,:32], r1
        vld1.32 {d19[0]}, [r0,:32], r1
        sub r0, r0, r1, lsl #2 @ restore original r0 value

        vaddw.u8 q0, q0, d18 @ line[0,2] += dest[0,2]
        vaddw.u8 q1, q1, d19 @ line[3,1] += dest[3,1]

        vqmovun.s16 d0, q0 @ clip(line[0,2])
        vqmovun.s16 d1, q1 @ clip(line[3,1])

        vst1.32 {d0[0]}, [r0,:32], r1 @ write dest
        vst1.32 {d1[1]}, [r0,:32], r1
        vst1.32 {d0[1]}, [r0,:32], r1
        vst1.32 {d1[0]}, [r0,:32]

        bx lr
endfunc

@ The absolute value of multiplication constants from vc1_mspel_filter and vc1_mspel_{ver,hor}_filter_16bits.
@ The sign is embedded in the code below that carries out the multiplication (mspel_filter{,.16}).
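@ For reference, each filtered output then has roughly the shape (sketch only;
@ the exact rounding term is what the ADDSHIFT constants below encode):
@
@     mode 1: (-4 * p0 + 53 * p1 + 18 * p2 - 3 * p3 + r) >> 6
@     mode 2: (-1 * p0 +  9 * p1 +  9 * p2 - 1 * p3 + r) >> 4
@     mode 3: (-3 * p0 + 18 * p1 + 53 * p2 - 4 * p3 + r) >> 6
@
@ where p0..p3 are four consecutive source pixels along the filter direction
@ and r is a rounding value derived from rnd.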
#define MSPEL_MODE_1_MUL_CONSTANTS 4, 53, 18, 3
#define MSPEL_MODE_2_MUL_CONSTANTS 1, 9, 9, 1
#define MSPEL_MODE_3_MUL_CONSTANTS 3, 18, 53, 4

@ These constants are from reading the source code of vc1_mspel_mc and determining the value that
@ is added to `rnd` to result in the variable `r`, and the value of the variable `shift`.
#define MSPEL_MODES_11_ADDSHIFT_CONSTANTS 15, 5
#define MSPEL_MODES_12_ADDSHIFT_CONSTANTS 3, 3
#define MSPEL_MODES_13_ADDSHIFT_CONSTANTS 15, 5
#define MSPEL_MODES_21_ADDSHIFT_CONSTANTS MSPEL_MODES_12_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_22_ADDSHIFT_CONSTANTS 0, 1
#define MSPEL_MODES_23_ADDSHIFT_CONSTANTS 3, 3
#define MSPEL_MODES_31_ADDSHIFT_CONSTANTS MSPEL_MODES_13_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_32_ADDSHIFT_CONSTANTS MSPEL_MODES_23_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_33_ADDSHIFT_CONSTANTS 15, 5

@ The addition and shift constants from vc1_mspel_filter.
#define MSPEL_MODE_1_ADDSHIFT_CONSTANTS 32, 6
#define MSPEL_MODE_2_ADDSHIFT_CONSTANTS 8, 4
#define MSPEL_MODE_3_ADDSHIFT_CONSTANTS 32, 6

@ Setup constants in registers for a subsequent use of mspel_filter{,.16}.
.macro mspel_constants typesize reg_a reg_b reg_c reg_d filter_a filter_b filter_c filter_d reg_add filter_add_register
        @ Typesize should be i8 or i16.

        @ Only set the register if the value is not 1 and unique
        .if \filter_a != 1
        vmov.\typesize \reg_a, #\filter_a @ reg_a = filter_a
        .endif
        vmov.\typesize \reg_b, #\filter_b @ reg_b = filter_b
        .if \filter_b != \filter_c
        vmov.\typesize \reg_c, #\filter_c @ reg_c = filter_c
        .endif
        .if \filter_d != 1
        vmov.\typesize \reg_d, #\filter_d @ reg_d = filter_d
        .endif
        @ vdup to double the size of typesize
        .ifc \typesize,i8
        vdup.16 \reg_add, \filter_add_register @ reg_add = filter_add_register
        .else
        vdup.32 \reg_add, \filter_add_register @ reg_add = filter_add_register
        .endif
.endm

@ After mspel_constants has been used, do the filtering.
.macro mspel_filter acc dest src0 src1 src2 src3 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift narrow=1
        .if \filter_a != 1
        @ If filter_a != 1, then we need a move and subtract instruction
        vmov \acc, \reg_add @ acc = reg_add
        vmlsl.u8 \acc, \reg_a, \src0 @ acc -= filter_a * src[-stride]
        .else
        @ If filter_a is 1, then just subtract without an extra move
        vsubw.u8 \acc, \reg_add, \src0 @ acc = reg_add - src[-stride]  @ since filter_a == 1
        .endif
        vmlal.u8 \acc, \reg_b, \src1 @ acc += filter_b * src[0]
        .if \filter_b != \filter_c
        vmlal.u8 \acc, \reg_c, \src2 @ acc += filter_c * src[stride]
        .else
        @ If filter_b is the same as filter_c, use the same reg_b register
        vmlal.u8 \acc, \reg_b, \src2 @ acc += filter_c * src[stride]  @ where filter_c == filter_b
        .endif
        .if \filter_d != 1
        @ If filter_d != 1, then do a multiply accumulate
        vmlsl.u8 \acc, \reg_d, \src3 @ acc -= filter_d * src[stride * 2]
        .else
        @ If filter_d is 1, then just do a subtract
        vsubw.u8 \acc, \acc, \src3 @ acc -= src[stride * 2]  @ since filter_d == 1
        .endif
        .if \narrow
        vqshrun.s16 \dest, \acc, #\filter_shift @ dest = clip_uint8(acc >> filter_shift)
        .else
        vshr.s16 \dest, \acc, #\filter_shift @ dest = acc >> filter_shift
        .endif
.endm

@ This is similar to mspel_filter, but the input is 16-bit instead of 8-bit and narrow=0 is not supported.
.macro mspel_filter.16 acc0 acc1 acc0_0 acc0_1 dest src0 src1 src2 src3 src4 src5 src6 src7 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift
        .if \filter_a != 1
        vmov \acc0, \reg_add
        vmov \acc1, \reg_add
        vmlsl.s16 \acc0, \reg_a, \src0
        vmlsl.s16 \acc1, \reg_a, \src1
        .else
        vsubw.s16 \acc0, \reg_add, \src0
        vsubw.s16 \acc1, \reg_add, \src1
        .endif
        vmlal.s16 \acc0, \reg_b, \src2
        vmlal.s16 \acc1, \reg_b, \src3
        .if \filter_b != \filter_c
        vmlal.s16 \acc0, \reg_c, \src4
        vmlal.s16 \acc1, \reg_c, \src5
        .else
        vmlal.s16 \acc0, \reg_b, \src4
        vmlal.s16 \acc1, \reg_b, \src5
        .endif
        .if \filter_d != 1
        vmlsl.s16 \acc0, \reg_d, \src6
        vmlsl.s16 \acc1, \reg_d, \src7
        .else
        vsubw.s16 \acc0, \acc0, \src6
        vsubw.s16 \acc1, \acc1, \src7
        .endif
        @ Use acc0_0 and acc0_1 as temp space
        vqshrun.s32 \acc0_0, \acc0, #\filter_shift @ Shift and narrow with saturation from s32 to u16
        vqshrun.s32 \acc0_1, \acc1, #\filter_shift
        vqmovn.u16 \dest, \acc0 @ Narrow with saturation from u16 to u8
.endm

@ Register usage for put_vc1_mspel_mc functions. Registers marked 'hv' are only used in put_vc1_mspel_mc_hv.
@
@   r0        adjusted dst
@   r1        adjusted src
@   r2        stride
@   r3        adjusted rnd
@   r4  [hv]  tmp
@   r11 [hv]  sp saved
@   r12       loop counter
@   d0        src[-stride]
@   d1        src[0]
@   d2        src[stride]
@   d3        src[stride * 2]
@   q0  [hv]  src[-stride]
@   q1  [hv]  src[0]
@   q2  [hv]  src[stride]
@   q3  [hv]  src[stride * 2]
@   d21       often result from mspel_filter
@   q11       accumulator 0
@   q12 [hv]  accumulator 1
@   q13       accumulator initial value
@   d28       filter_a
@   d29       filter_b
@   d30       filter_c
@   d31       filter_d

@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
.macro put_vc1_mspel_mc_hv hmode vmode filter_h_a filter_h_b filter_h_c filter_h_d filter_v_a filter_v_b filter_v_c filter_v_d filter_add filter_shift
function ff_put_vc1_mspel_mc\hmode\()\vmode\()_neon, export=1
        push {r4, r11, lr}
        mov r11, sp @ r11 = stack pointer before realignment
A       bic sp, sp, #15 @ sp = round down to multiple of 16 bytes
T       bic r4, r11, #15
T       mov sp, r4
        sub sp, sp, #(8*2*16) @ make space for 8 rows * 2 byte per element * 16 elements per row (to fit 11 actual elements per row)
        mov r4, sp @ r4 = int16_t tmp[8 * 16]

        sub r1, r1, #1 @ src -= 1
        .if \filter_add != 0
        add r3, r3, #\filter_add @ r3 = filter_add + rnd
        .endif
        mov r12, #8 @ loop counter
        sub r1, r1, r2 @ r1 = &src[-stride]  @ slide back

        @ Do vertical filtering from src into tmp
        mspel_constants i8, d28, d29, d30, d31, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, q13, r3

        vld1.64 {d0,d1}, [r1], r2
        vld1.64 {d2,d3}, [r1], r2
        vld1.64 {d4,d5}, [r1], r2

1:
        subs r12, r12, #4

        vld1.64 {d6,d7}, [r1], r2
        mspel_filter q11, q11, d0, d2, d4, d6, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter q12, q12, d1, d3, d5, d7, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64 {q11,q12}, [r4,:128]! @ store and increment

        vld1.64 {d0,d1}, [r1], r2
        mspel_filter q11, q11, d2, d4, d6, d0, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter q12, q12, d3, d5, d7, d1, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64 {q11,q12}, [r4,:128]! @ store and increment

        vld1.64 {d2,d3}, [r1], r2
        mspel_filter q11, q11, d4, d6, d0, d2, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter q12, q12, d5, d7, d1, d3, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64 {q11,q12}, [r4,:128]! @ store and increment

        vld1.64 {d4,d5}, [r1], r2
        mspel_filter q11, q11, d6, d0, d2, d4, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter q12, q12, d7, d1, d3, d5, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64 {q11,q12}, [r4,:128]! @ store and increment

        bne 1b

        rsb r3, r3, #(64 + \filter_add) @ r3 = (64 + filter_add) - r3
        mov r12, #8 @ loop counter
        mov r4, sp @ r4 = tmp

        @ Do horizontal filtering from temp to dst
        mspel_constants i16, d28, d29, d30, d31, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, q13, r3

2:
        subs r12, r12, #1

        vld1.64 {q0,q1}, [r4,:128]! @ read one line of tmp
        vext.16 q2, q0, q1, #2
        vext.16 q3, q0, q1, #3
        vext.16 q1, q0, q1, #1 @ do last because it writes to q1 which is read by the other vext instructions

        mspel_filter.16 q11, q12, d22, d23, d21, d0, d1, d2, d3, d4, d5, d6, d7, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, d28, d29, d30, d31, q13, 7

        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        bne 2b

        mov sp, r11
        pop {r4, r11, pc}
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for horizontal and vertical filtering.
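@ For example, PUT_VC1_MSPEL_MC_HV(1, 2) below expands to roughly
@     put_vc1_mspel_mc_hv 1, 2, 4, 53, 18, 3, 1, 9, 9, 1, 3, 3
@ i.e. the horizontal mode 1 multipliers, the vertical mode 2 multipliers and
@ the combined add/shift pair for the 1,2 mode combination.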
#define PUT_VC1_MSPEL_MC_HV(hmode, vmode) \
    put_vc1_mspel_mc_hv hmode, vmode, \
        MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, \
        MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, \
        MSPEL_MODES_ ## hmode ## vmode ## _ADDSHIFT_CONSTANTS

PUT_VC1_MSPEL_MC_HV(1, 1)
PUT_VC1_MSPEL_MC_HV(1, 2)
PUT_VC1_MSPEL_MC_HV(1, 3)
PUT_VC1_MSPEL_MC_HV(2, 1)
PUT_VC1_MSPEL_MC_HV(2, 2)
PUT_VC1_MSPEL_MC_HV(2, 3)
PUT_VC1_MSPEL_MC_HV(3, 1)
PUT_VC1_MSPEL_MC_HV(3, 2)
PUT_VC1_MSPEL_MC_HV(3, 3)

#undef PUT_VC1_MSPEL_MC_HV

.macro put_vc1_mspel_mc_h_only hmode filter_a filter_b filter_c filter_d filter_add filter_shift
function ff_put_vc1_mspel_mc\hmode\()0_neon, export=1
        rsb r3, r3, #\filter_add @ r3 = filter_add - r = filter_add - rnd
        mov r12, #8 @ loop counter
        sub r1, r1, #1 @ slide back, using immediate

        mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3

1:
        subs r12, r12, #1

        vld1.64 {d0,d1}, [r1], r2 @ read 16 bytes even though we only need 11, also src += stride
        vext.8 d2, d0, d1, #2
        vext.8 d3, d0, d1, #3
        vext.8 d1, d0, d1, #1 @ do last because it writes to d1 which is read by the other vext instructions

        mspel_filter q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift

        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        bne 1b

        bx lr
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for horizontal only filtering.
#define PUT_VC1_MSPEL_MC_H_ONLY(hmode) \
    put_vc1_mspel_mc_h_only hmode, MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## hmode ## _ADDSHIFT_CONSTANTS

PUT_VC1_MSPEL_MC_H_ONLY(1)
PUT_VC1_MSPEL_MC_H_ONLY(2)
PUT_VC1_MSPEL_MC_H_ONLY(3)

#undef PUT_VC1_MSPEL_MC_H_ONLY

@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
.macro put_vc1_mspel_mc_v_only vmode filter_a filter_b filter_c filter_d filter_add filter_shift
function ff_put_vc1_mspel_mc0\vmode\()_neon, export=1
        add r3, r3, #\filter_add - 1 @ r3 = filter_add - r = filter_add - (1 - rnd) = filter_add - 1 + rnd
        mov r12, #8 @ loop counter
        sub r1, r1, r2 @ r1 = &src[-stride]  @ slide back

        mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3

        vld1.64 {d0}, [r1], r2 @ d0 = src[-stride]
        vld1.64 {d1}, [r1], r2 @ d1 = src[0]
        vld1.64 {d2}, [r1], r2 @ d2 = src[stride]

1:
        subs r12, r12, #4

        vld1.64 {d3}, [r1], r2 @ d3 = src[stride * 2]
        mspel_filter q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        vld1.64 {d0}, [r1], r2 @ d0 = next line
        mspel_filter q11, d21, d1, d2, d3, d0, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        vld1.64 {d1}, [r1], r2 @ d1 = next line
        mspel_filter q11, d21, d2, d3, d0, d1, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        vld1.64 {d2}, [r1], r2 @ d2 = next line
        mspel_filter q11, d21, d3, d0, d1, d2, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        bne 1b

        bx lr
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for vertical only filtering.
#define PUT_VC1_MSPEL_MC_V_ONLY(vmode) \
    put_vc1_mspel_mc_v_only vmode, MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## vmode ## _ADDSHIFT_CONSTANTS

PUT_VC1_MSPEL_MC_V_ONLY(1)
PUT_VC1_MSPEL_MC_V_ONLY(2)
PUT_VC1_MSPEL_MC_V_ONLY(3)

#undef PUT_VC1_MSPEL_MC_V_ONLY

function ff_put_pixels8x8_neon, export=1
        vld1.64 {d0}, [r1], r2
        vld1.64 {d1}, [r1], r2
        vld1.64 {d2}, [r1], r2
        vld1.64 {d3}, [r1], r2
        vld1.64 {d4}, [r1], r2
        vld1.64 {d5}, [r1], r2
        vld1.64 {d6}, [r1], r2
        vld1.64 {d7}, [r1]
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        vst1.64 {d2}, [r0,:64], r2
        vst1.64 {d3}, [r0,:64], r2
        vst1.64 {d4}, [r0,:64], r2
        vst1.64 {d5}, [r0,:64], r2
        vst1.64 {d6}, [r0,:64], r2
        vst1.64 {d7}, [r0,:64]
        bx lr
endfunc

function ff_vc1_inv_trans_8x8_dc_neon, export=1
        ldrsh r2, [r2] @ int dc = block[0];

        vld1.64 {d0}, [r0,:64], r1
        vld1.64 {d1}, [r0,:64], r1
        vld1.64 {d4}, [r0,:64], r1
        vld1.64 {d5}, [r0,:64], r1

        add r2, r2, r2, lsl #1 @ dc = (3 * dc + 1) >> 1;
        vld1.64 {d6}, [r0,:64], r1
        add r2, r2, #1
        vld1.64 {d7}, [r0,:64], r1
        vld1.64 {d16}, [r0,:64], r1
        vld1.64 {d17}, [r0,:64], r1
        asr r2, r2, #1

        sub r0, r0, r1, lsl #3 @ restore r0 to original value

        add r2, r2, r2, lsl #1 @ dc = (3 * dc + 16) >> 5;
        add r2, r2, #16
        asr r2, r2, #5

        vdup.16 q1, r2 @ dc

        vaddw.u8 q9, q1, d0
        vaddw.u8 q10, q1, d1
        vaddw.u8 q11, q1, d4
        vaddw.u8 q12, q1, d5
        vqmovun.s16 d0, q9
        vqmovun.s16 d1, q10
        vqmovun.s16 d4, q11
        vst1.64 {d0}, [r0,:64], r1
        vqmovun.s16 d5, q12
        vst1.64 {d1}, [r0,:64], r1
        vaddw.u8 q13, q1, d6
        vst1.64 {d4}, [r0,:64], r1
        vaddw.u8 q14, q1, d7
        vst1.64 {d5}, [r0,:64], r1
        vaddw.u8 q15, q1, d16
        vaddw.u8 q1, q1, d17 @ this destroys q1
        vqmovun.s16 d6, q13
        vqmovun.s16 d7, q14
        vqmovun.s16 d16, q15
        vqmovun.s16 d17, q1
        vst1.64 {d6}, [r0,:64], r1
        vst1.64 {d7}, [r0,:64], r1
        vst1.64 {d16}, [r0,:64], r1
        vst1.64 {d17}, [r0,:64]
        bx lr
endfunc

function ff_vc1_inv_trans_8x4_dc_neon, export=1
        ldrsh r2, [r2] @ int dc = block[0];

        vld1.64 {d0}, [r0,:64], r1
        vld1.64 {d1}, [r0,:64], r1
        vld1.64 {d4}, [r0,:64], r1
        vld1.64 {d5}, [r0,:64], r1

        add r2, r2, r2, lsl #1 @ dc = (3 * dc + 1) >> 1;

        sub r0, r0, r1, lsl #2 @ restore r0 to original value

        add r2, r2, #1
        asr r2, r2, #1

        add r2, r2, r2, lsl #4 @ dc = (17 * dc + 64) >> 7;
        add r2, r2, #64
        asr r2, r2, #7

        vdup.16 q1, r2 @ dc

        vaddw.u8 q3, q1, d0
        vaddw.u8 q8, q1, d1
        vaddw.u8 q9, q1, d4
        vaddw.u8 q10, q1, d5
        vqmovun.s16 d0, q3
        vqmovun.s16 d1, q8
        vqmovun.s16 d4, q9
        vst1.64 {d0}, [r0,:64], r1
        vqmovun.s16 d5, q10
        vst1.64 {d1}, [r0,:64], r1
        vst1.64 {d4}, [r0,:64], r1
        vst1.64 {d5}, [r0,:64]
        bx lr
endfunc

function ff_vc1_inv_trans_4x8_dc_neon, export=1
        ldrsh r2, [r2] @ int dc = block[0];

        vld1.32 {d0[]},  [r0,:32], r1
        vld1.32 {d1[]},  [r0,:32], r1
        vld1.32 {d0[1]}, [r0,:32], r1
        vld1.32 {d1[1]}, [r0,:32], r1

        add r2, r2, r2, lsl #4 @ dc = (17 * dc + 4) >> 3;
        vld1.32 {d4[]},  [r0,:32], r1
        add r2, r2, #4
        vld1.32 {d5[]},  [r0,:32], r1
        vld1.32 {d4[1]}, [r0,:32], r1
        asr r2, r2, #3
        vld1.32 {d5[1]}, [r0,:32], r1

        add r2, r2, r2, lsl #1 @ dc = (12 * dc + 64) >> 7;

        sub r0, r0, r1, lsl #3 @ restore r0 to original value

        lsl r2, r2, #2
        add r2, r2, #64
        asr r2, r2, #7

        vdup.16 q1, r2 @ dc

        vaddw.u8 q3, q1, d0
        vaddw.u8 q8, q1, d1
        vaddw.u8 q9, q1, d4
        vaddw.u8 q10, q1, d5
        vqmovun.s16 d0, q3
        vst1.32 {d0[0]}, [r0,:32], r1
        vqmovun.s16 d1, q8
        vst1.32 {d1[0]}, [r0,:32], r1
        vqmovun.s16 d4, q9
        vst1.32 {d0[1]}, [r0,:32], r1
        vqmovun.s16 d5, q10
        vst1.32 {d1[1]}, [r0,:32], r1
        vst1.32 {d4[0]}, [r0,:32], r1
        vst1.32 {d5[0]}, [r0,:32], r1
        vst1.32 {d4[1]}, [r0,:32], r1
        vst1.32 {d5[1]}, [r0,:32]
        bx lr
endfunc

function ff_vc1_inv_trans_4x4_dc_neon, export=1
        ldrsh r2, [r2] @ int dc = block[0];

        vld1.32 {d0[]},  [r0,:32], r1
        vld1.32 {d1[]},  [r0,:32], r1
        vld1.32 {d0[1]}, [r0,:32], r1
        vld1.32 {d1[1]}, [r0,:32], r1

        add r2, r2, r2, lsl #4 @ dc = (17 * dc + 4) >> 3;

        sub r0, r0, r1, lsl #2 @ restore r0 to original value

        add r2, r2, #4
        asr r2, r2, #3

        add r2, r2, r2, lsl #4 @ dc = (17 * dc + 64) >> 7;
        add r2, r2, #64
        asr r2, r2, #7

        vdup.16 q1, r2 @ dc

        vaddw.u8 q2, q1, d0
        vaddw.u8 q3, q1, d1
        vqmovun.s16 d0, q2
        vst1.32 {d0[0]}, [r0,:32], r1
        vqmovun.s16 d1, q3
        vst1.32 {d1[0]}, [r0,:32], r1
        vst1.32 {d0[1]}, [r0,:32], r1
        vst1.32 {d1[1]}, [r0,:32]
        bx lr
endfunc

@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
@ On entry:
@   r0 -> top-left pel of lower block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
function ff_vc1_v_loop_filter4_neon, export=1
        sub r3, r0, r1, lsl #2
        vldr d0, .Lcoeffs
        vld1.32 {d1[0]}, [r0], r1 @ P5
        vld1.32 {d2[0]}, [r3], r1 @ P1
        vld1.32 {d3[0]}, [r3], r1 @ P2
        vld1.32 {d4[0]}, [r0], r1 @ P6
        vld1.32 {d5[0]}, [r3], r1 @ P3
        vld1.32 {d6[0]}, [r0], r1 @ P7
        vld1.32 {d7[0]}, [r3] @ P4
        vld1.32 {d16[0]}, [r0] @ P8
        vshll.u8 q9, d1, #1 @ 2*P5
        vdup.16 d17, r2 @ pq
        vshll.u8 q10, d2, #1 @ 2*P1
        vmovl.u8 q11, d3 @ P2
        vmovl.u8 q1, d4 @ P6
        vmovl.u8 q12, d5 @ P3
        vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2
        vmovl.u8 q11, d6 @ P7
        vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6
        vshll.u8 q2, d5, #1 @ 2*P3
        vmovl.u8 q3, d7 @ P4
        vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7
        vmovl.u8 q11, d16 @ P8
        vmla.u16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3
        vmovl.u8 q12, d1 @ P5
        vmls.u16 d4, d6, d0[1] @ 2*P3-5*P4
        vmls.u16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8
        vsub.i16 d1, d6, d24 @ P4-P5
        vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4
        vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5
        vmls.i16 d4, d2, d0[0] @ 2*P3-5*P4+5*P5-2*P6
        vabs.s16 d2, d1
        vrshr.s16 d3, d18, #3
        vrshr.s16 d5, d20, #3
        vshr.s16 d2, d2, #1 @ clip
        vrshr.s16 d4, d4, #3
        vabs.s16 d3, d3 @ a2
        vshr.s16 d1, d1, #8 @ clip_sign
        vabs.s16 d5, d5 @ a1
        vceq.i16 d7, d2, #0 @ test clip == 0
        vabs.s16 d16, d4 @ a0
        vshr.s16 d4, d4, #8 @ a0_sign
        vcge.s16 d18, d5, d3 @ test a1 >= a2
        vcge.s16 d17, d16, d17 @ test a0 >= pq
        vbsl d18, d3, d5 @ a3
        vsub.i16 d1, d1, d4 @ clip_sign - a0_sign
        vorr d3, d7, d17 @ test clip == 0 || a0 >= pq
        vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        vcge.s16 d5, d18, d16 @ test a3 >= a0
        vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
        vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0
        vmov.32 r0, d4[1] @ move to gp reg
        vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
        vcge.s16 d4, d0, d2
        tst r0, #1
        bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered
        vbsl d4, d2, d0 @ FFMIN(d, clip)
        vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        vqmovun.s16 d0, q3
        vqmovun.s16 d1, q12
        vst1.32 {d0[0]}, [r3], r1
        vst1.32 {d1[0]}, [r3]
1:      bx lr
endfunc

@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
@ On entry:
@   r0 -> top-left pel of right block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
function ff_vc1_h_loop_filter4_neon, export=1
        sub r3, r0, #4 @ where to start reading
        vldr d0, .Lcoeffs
        vld1.32 {d2}, [r3], r1
        sub r0, r0, #1 @ where to start writing
        vld1.32 {d4}, [r3], r1
        vld1.32 {d3}, [r3], r1
        vld1.32 {d5}, [r3]
        vdup.16 d1, r2 @ pq
        vtrn.8 q1, q2
        vtrn.16 d2, d3 @ P1, P5, P3, P7
        vtrn.16 d4, d5 @ P2, P6, P4, P8
        vshll.u8 q3, d2, #1 @ 2*P1, 2*P5
        vmovl.u8 q8, d4 @ P2, P6
        vmovl.u8 q9, d3 @ P3, P7
        vmovl.u8 q2, d5 @ P4, P8
        vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6
        vshll.u8 q10, d3, #1 @ 2*P3, 2*P7
        vmovl.u8 q1, d2 @ P1, P5
        vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
        vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
        vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later
        vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4
        vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5
        vsub.i16 d3, d4, d2 @ P4-P5
        vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6
        vrshr.s16 q3, q3, #3
        vabs.s16 d5, d3
        vshr.s16 d3, d3, #8 @ clip_sign
        vrshr.s16 d16, d20, #3
        vabs.s16 q3, q3 @ a1, a2
        vshr.s16 d5, d5, #1 @ clip
        vabs.s16 d17, d16 @ a0
        vceq.i16 d18, d5, #0 @ test clip == 0
        vshr.s16 d16, d16, #8 @ a0_sign
        vcge.s16 d19, d6, d7 @ test a1 >= a2
        vcge.s16 d1, d17, d1 @ test a0 >= pq
        vsub.i16 d16, d3, d16 @ clip_sign - a0_sign
        vbsl d19, d7, d6 @ a3
        vorr d1, d18, d1 @ test clip == 0 || a0 >= pq
        vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        vcge.s16 d6, d19, d17 @ test a3 >= a0
        vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
        vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0
        vmov.32 r2, d3[1] @ move to gp reg
        vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
        vcge.s16 d3, d0, d5
        tst r2, #1
        bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered
        vbsl d3, d5, d0 @ FFMIN(d, clip)
        vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        vqmovun.s16 d1, q1
        vqmovun.s16 d0, q2
        vst2.8 {d0[0], d1[0]}, [r0], r1
        vst2.8 {d0[1], d1[1]}, [r0], r1
        vst2.8 {d0[2], d1[2]}, [r0], r1
        vst2.8 {d0[3], d1[3]}, [r0]
1:      bx lr
endfunc

@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
@ On entry:
@   r0 -> top-left pel of lower block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
function ff_vc1_v_loop_filter8_neon, export=1
        sub r3, r0, r1, lsl #2
        vldr d0, .Lcoeffs
        vld1.32 {d1}, [r0 :64], r1 @ P5
        vld1.32 {d2}, [r3 :64], r1 @ P1
        vld1.32 {d3}, [r3 :64], r1 @ P2
        vld1.32 {d4}, [r0 :64], r1 @ P6
        vld1.32 {d5}, [r3 :64], r1 @ P3
        vld1.32 {d6}, [r0 :64], r1 @ P7
        vshll.u8 q8, d1, #1 @ 2*P5
        vshll.u8 q9, d2, #1 @ 2*P1
        vld1.32 {d7}, [r3 :64] @ P4
        vmovl.u8 q1, d3 @ P2
        vld1.32 {d20}, [r0 :64] @ P8
        vmovl.u8 q11, d4 @ P6
        vdup.16 q12, r2 @ pq
        vmovl.u8 q13, d5 @ P3
        vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2
        vmovl.u8 q1, d6 @ P7
        vshll.u8 q2, d5, #1 @ 2*P3
        vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6
        vmovl.u8 q3, d7 @ P4
        vmovl.u8 q10, d20 @ P8
        vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7
        vmovl.u8 q1, d1 @ P5
        vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3
        vsub.i16 q13, q3, q1 @ P4-P5
        vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4
        vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8
        vabs.s16 q10, q13
        vshr.s16 q13, q13, #8 @ clip_sign
        vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4
        vshr.s16 q10, q10, #1 @ clip
        vmla.i16 q2, q1, d0[1] @ 2*P3-5*P4+5*P5
        vrshr.s16 q8, q8, #3
        vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6
        vceq.i16 q11, q10, #0 @ test clip == 0
        vrshr.s16 q9, q9, #3
        vabs.s16 q8, q8 @ a2
        vabs.s16 q9, q9 @ a1
        vrshr.s16 q2, q2, #3
        vcge.s16 q14, q9, q8 @ test a1 >= a2
        vabs.s16 q15, q2 @ a0
        vshr.s16 q2, q2, #8 @ a0_sign
        vbsl q14, q8, q9 @ a3
        vcge.s16 q8, q15, q12 @ test a0 >= pq
        vsub.i16 q2, q13, q2 @ clip_sign - a0_sign
        vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        vcge.s16 q12, q14, q15 @ test a3 >= a0
        vorr q8, q11, q8 @ test clip == 0 || a0 >= pq
        vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
        vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0
        vshl.i64 q11, q9, #16
        vmov.32 r0, d18[1] @ move to gp reg
        vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
        vmov.32 r2, d19[1]
        vshr.s64 q9, q11, #48
        vcge.s16 q11, q0, q10
        vorr q8, q8, q9
        and r0, r0, r2
        vbsl q11, q10, q0 @ FFMIN(d, clip)
        tst r0, #1
        bne 1f @ none of the 8 pixel pairs should be updated in this case
        vbic q0, q11, q8 @ set each d to zero if it should not be filtered
        vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        vqmovun.s16 d0, q3
        vqmovun.s16 d1, q1
        vst1.32 {d0}, [r3 :64], r1
        vst1.32 {d1}, [r3 :64]
1:      bx lr
endfunc

.align 5
.Lcoeffs:
.quad 0x00050002

@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
@ On entry:
@   r0 -> top-left pel of right block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
function ff_vc1_h_loop_filter8_neon, export=1
        push {lr}
        sub r3, r0, #4 @ where to start reading
        vldr d0, .Lcoeffs
        vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]...
        sub r0, r0, #1 @ where to start writing
        vld1.32 {d4}, [r3], r1
        add r12, r0, r1, lsl #2
        vld1.32 {d3}, [r3], r1
        vld1.32 {d5}, [r3], r1
        vld1.32 {d6}, [r3], r1
        vld1.32 {d16}, [r3], r1
        vld1.32 {d7}, [r3], r1
        vld1.32 {d17}, [r3]
        vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]...
        vdup.16 q9, r2 @ pq
        vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
        vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
        vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]...
        vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
        vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
        vtrn.32 d2, d6 @ P1, P5
        vtrn.32 d4, d16 @ P2, P6
        vtrn.32 d3, d7 @ P3, P7
        vtrn.32 d5, d17 @ P4, P8
        vshll.u8 q10, d2, #1 @ 2*P1
        vshll.u8 q11, d6, #1 @ 2*P5
        vmovl.u8 q12, d4 @ P2
        vmovl.u8 q13, d16 @ P6
        vmovl.u8 q14, d3 @ P3
        vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2
        vmovl.u8 q12, d7 @ P7
        vshll.u8 q1, d3, #1 @ 2*P3
        vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6
        vmovl.u8 q2, d5 @ P4
        vmovl.u8 q8, d17 @ P8
        vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7
        vmovl.u8 q3, d6 @ P5
        vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3
        vsub.i16 q12, q2, q3 @ P4-P5
        vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4
        vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8
        vabs.s16 q8, q12
        vshr.s16 q12, q12, #8 @ clip_sign
        vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4
        vshr.s16 q8, q8, #1 @ clip
        vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5
        vrshr.s16 q11, q11, #3
        vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6
        vceq.i16 q13, q8, #0 @ test clip == 0
        vrshr.s16 q10, q10, #3
        vabs.s16 q11, q11 @ a2
        vabs.s16 q10, q10 @ a1
        vrshr.s16 q1, q1, #3
        vcge.s16 q14, q10, q11 @ test a1 >= a2
        vabs.s16 q15, q1 @ a0
        vshr.s16 q1, q1, #8 @ a0_sign
        vbsl q14, q11, q10 @ a3
        vcge.s16 q9, q15, q9 @ test a0 >= pq
        vsub.i16 q1, q12, q1 @ clip_sign - a0_sign
        vqsub.u16 q10, q15, q14 @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        vcge.s16 q11, q14, q15 @ test a3 >= a0
        vorr q9, q13, q9 @ test clip == 0 || a0 >= pq
        vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
        vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0
        vmov.32 r2, d20[1] @ move to gp reg
        vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
        vmov.32 r3, d21[1]
        vcge.s16 q10, q0, q8
        and r14, r2, r3
        vbsl q10, q8, q0 @ FFMIN(d, clip)
        tst r14, #1
        bne 2f @ none of the 8 pixel pairs should be updated in this case
        vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        vqmovun.s16 d1, q3
        vqmovun.s16 d0, q2
        tst r2, #1
        bne 1f @ none of the first 4 pixel pairs should be updated if so
        vst2.8 {d0[0], d1[0]}, [r0], r1
        vst2.8 {d0[1], d1[1]}, [r0], r1
        vst2.8 {d0[2], d1[2]}, [r0], r1
        vst2.8 {d0[3], d1[3]}, [r0]
1:      tst r3, #1
        bne 2f @ none of the second 4 pixel pairs should be updated if so
        vst2.8 {d0[4], d1[4]}, [r12], r1
        vst2.8 {d0[5], d1[5]}, [r12], r1
        vst2.8 {d0[6], d1[6]}, [r12], r1
        vst2.8 {d0[7], d1[7]}, [r12]
2:      pop {pc}
endfunc

@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
@ On entry:
@   r0 -> top-left pel of lower block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
function ff_vc1_v_loop_filter16_neon, export=1
        vpush {d8-d15}
        sub r3, r0, r1, lsl #2
        vldr d0, .Lcoeffs
        vld1.64 {q1}, [r0 :128], r1 @ P5
        vld1.64 {q2}, [r3 :128], r1 @ P1
        vld1.64 {q3}, [r3 :128], r1 @ P2
        vld1.64 {q4}, [r0 :128], r1 @ P6
        vld1.64 {q5}, [r3 :128], r1 @ P3
        vld1.64 {q6}, [r0 :128], r1 @ P7
        vshll.u8 q7, d2, #1 @ 2*P5[0..7]
        vshll.u8 q8, d4, #1 @ 2*P1[0..7]
        vld1.64 {q9}, [r3 :128] @ P4
        vmovl.u8 q10, d6 @ P2[0..7]
        vld1.64 {q11}, [r0 :128] @ P8
        vmovl.u8 q12, d8 @ P6[0..7]
        vdup.16 q13, r2 @ pq
        vshll.u8 q2, d5, #1 @ 2*P1[8..15]
        vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7]
        vshll.u8 q10, d3, #1 @ 2*P5[8..15]
        vmovl.u8 q3, d7 @ P2[8..15]
        vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7]
        vmovl.u8 q4, d9 @ P6[8..15]
        vmovl.u8 q14, d10 @ P3[0..7]
        vmovl.u8 q15, d12 @ P7[0..7]
        vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15]
        vshll.u8 q3, d10, #1 @ 2*P3[0..7]
        vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]
        vmovl.u8 q6, d13 @ P7[8..15]
        vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        vmovl.u8 q14, d18 @ P4[0..7]
        vmovl.u8 q9, d19 @ P4[8..15]
        vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        vmovl.u8 q15, d11 @ P3[8..15]
        vshll.u8 q5, d11, #1 @ 2*P3[8..15]
        vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7]
        vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        vmovl.u8 q15, d22 @ P8[0..7]
        vmovl.u8 q11, d23 @ P8[8..15]
        vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        vmovl.u8 q6, d2 @ P5[0..7]
        vmovl.u8 q1, d3 @ P5[8..15]
        vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15]
        vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        vsub.i16 q15, q14, q6 @ P4[0..7]-P5[0..7]
        vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        vrshr.s16 q8, q8, #3
        vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        vrshr.s16 q7, q7, #3
        vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        vabs.s16 q11, q15
        vabs.s16 q8, q8 @ a1[0..7]
        vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        vshr.s16 q15, q15, #8 @ clip_sign[0..7]
        vrshr.s16 q2, q2, #3
        vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        vabs.s16 q7, q7 @ a2[0..7]
        vrshr.s16 q10, q10, #3
        vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15]
        vshr.s16 q11, q11, #1 @ clip[0..7]
        vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7]
        vabs.s16 q2, q2 @ a1[8..15]
        vrshr.s16 q3, q3, #3
        vabs.s16 q10, q10 @ a2[8..15]
        vbsl q4, q7, q8 @ a3[0..7]
        vabs.s16 q7, q12
        vshr.s16 q8, q12, #8 @ clip_sign[8..15]
        vrshr.s16 q5, q5, #3
        vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8..15]
        vshr.s16 q7, q7, #1 @ clip[8..15]
        vbsl q12, q10, q2 @ a3[8..15]
        vabs.s16 q2, q3 @ a0[0..7]
        vceq.i16 q10, q11, #0 @ test clip[0..7] == 0
        vshr.s16 q3, q3, #8 @ a0_sign[0..7]
        vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7]
        vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq
        vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq
        vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7]
        vabs.s16 q4, q5 @ a0[8..15]
        vshr.s16 q5, q5, #8 @ a0_sign[8..15]
        vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq
        vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15]
        vceq.i16 q8, q7, #0 @ test clip[8..15] == 0
        vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        vmov.32 r0, d4[1] @ move to gp reg
        vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq
        vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        vmov.32 r2, d5[1]
        vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15]
        vshl.i64 q2, q2, #16
        vcge.s16 q12, q15, q11
        vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        vshr.s64 q2, q2, #48
        and r0, r0, r2
        vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7])
        vshl.i64 q11, q4, #16
        vmov.32 r2, d8[1]
        vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
(5*(a0[8..15]-a3[8..15]))>>3 : 0 1594 vorr q2, q10, q2 1595 vmov.32 r12, d9[1] 1596 vshr.s64 q4, q11, #48 1597 vcge.s16 q10, q0, q7 1598 vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) 1599 vorr q4, q8, q4 1600 and r2, r2, r12 1601 vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15]) 1602 vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] 1603 and r0, r0, r2 1604 vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) 1605 tst r0, #1 1606 bne 1f @ none of the 16 pixel pairs should be updated in this case 1607 vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] 1608 vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] 1609 vqmovun.s16 d4, q14 1610 vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] 1611 vqmovun.s16 d0, q6 1612 vqmovun.s16 d5, q9 1613 vqmovun.s16 d1, q1 1614 vst1.64 {q2}, [r3 :128], r1 1615 vst1.64 {q0}, [r3 :128] 1616 1: vpop {d8-d15} 1617 bx lr 1618 endfunc 1619 1620 @ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks 1621 @ On entry: 1622 @ r0 -> top-left pel of right block 1623 @ r1 = row stride, bytes 1624 @ r2 = PQUANT bitstream parameter 1625 function ff_vc1_h_loop_filter16_neon, export=1 1626 push {r4-r6,lr} 1627 vpush {d8-d15} 1628 sub r3, r0, #4 @ where to start reading 1629 vldr d0, .Lcoeffs 1630 vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... 1631 sub r0, r0, #1 @ where to start writing 1632 vld1.32 {d3}, [r3], r1 1633 add r4, r0, r1, lsl #2 1634 vld1.32 {d10}, [r3], r1 1635 vld1.32 {d11}, [r3], r1 1636 vld1.32 {d16}, [r3], r1 1637 vld1.32 {d4}, [r3], r1 1638 vld1.32 {d8}, [r3], r1 1639 vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]... 1640 vld1.32 {d14}, [r3], r1 1641 vld1.32 {d5}, [r3], r1 1642 vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]... 1643 vld1.32 {d6}, [r3], r1 1644 vld1.32 {d12}, [r3], r1 1645 vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]... 1646 vld1.32 {d13}, [r3], r1 1647 vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... 1648 vld1.32 {d1}, [r3], r1 1649 vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]... 1650 vld1.32 {d7}, [r3], r1 1651 vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... 1652 vld1.32 {d9}, [r3], r1 1653 vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]... 1654 vld1.32 {d15}, [r3] 1655 vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]... 1656 vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... 1657 vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]... 1658 vdup.16 q9, r2 @ pq 1659 vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]... 1660 vtrn.32 d2, d16 @ P1[0..7], P5[0..7] 1661 vtrn.16 d5, d12 @ P1[8], P1[7], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]... 1662 vtrn.16 d6, d13 @ P2[8], P2[7], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]... 
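@ The horizontally-neighbouring (vertical edge) case below cannot load P1..P8
@ directly: each set of 8 pixels spanning the edge lies along a row. It
@ therefore loads 16 rows of 8 bytes, starting 4 bytes to the left of the
@ edge, and transposes them in registers with vtrn.8/.16/.32 until each of
@ P1..P8 occupies one 16-element vector. Conceptually (scalar sketch only,
@ with row[][] and P[][] as illustrative names):
@
@   for (i = 0; i < 16; i++)         /* 16 rows crossing the block edge  */
@       for (j = 0; j < 8; j++)      /* 8 pixels either side of the edge */
@           P[j + 1][i] = row[i][j]; /* P1..P8 as in the comments below  */
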
@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
@ On entry:
@ r0 -> top-left pel of right block
@ r1 = row stride, bytes
@ r2 = PQUANT bitstream parameter
function ff_vc1_h_loop_filter16_neon, export=1
        push {r4-r6,lr}
        vpush {d8-d15}
        sub r3, r0, #4 @ where to start reading
        vldr d0, .Lcoeffs
        vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]...
        sub r0, r0, #1 @ where to start writing
        vld1.32 {d3}, [r3], r1
        add r4, r0, r1, lsl #2
        vld1.32 {d10}, [r3], r1
        vld1.32 {d11}, [r3], r1
        vld1.32 {d16}, [r3], r1
        vld1.32 {d4}, [r3], r1
        vld1.32 {d8}, [r3], r1
        vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]...
        vld1.32 {d14}, [r3], r1
        vld1.32 {d5}, [r3], r1
        vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]...
        vld1.32 {d6}, [r3], r1
        vld1.32 {d12}, [r3], r1
        vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]...
        vld1.32 {d13}, [r3], r1
        vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
        vld1.32 {d1}, [r3], r1
        vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]...
        vld1.32 {d7}, [r3], r1
        vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
        vld1.32 {d9}, [r3], r1
        vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]...
        vld1.32 {d15}, [r3]
        vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
        vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
        vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]...
        vdup.16 q9, r2 @ pq
        vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]...
        vtrn.32 d2, d16 @ P1[0..7], P5[0..7]
        vtrn.16 d5, d12 @ P1[8], P1[9], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]...
        vtrn.16 d6, d13 @ P2[8], P2[9], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]...
        vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]...
        vtrn.32 d3, d4 @ P2[0..7], P6[0..7]
        vshll.u8 q10, d2, #1 @ 2*P1[0..7]
        vtrn.32 d10, d8 @ P3[0..7], P7[0..7]
        vshll.u8 q11, d16, #1 @ 2*P5[0..7]
        vtrn.32 d11, d14 @ P4[0..7], P8[0..7]
        vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]...
        vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]...
        vmovl.u8 q1, d3 @ P2[0..7]
        vmovl.u8 q12, d4 @ P6[0..7]
        vtrn.32 d5, d1 @ P1[8..15], P5[8..15]
        vtrn.32 d6, d7 @ P2[8..15], P6[8..15]
        vtrn.32 d12, d9 @ P3[8..15], P7[8..15]
        vtrn.32 d13, d15 @ P4[8..15], P8[8..15]
        vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]
        vmovl.u8 q1, d10 @ P3[0..7]
        vshll.u8 q2, d5, #1 @ 2*P1[8..15]
        vshll.u8 q13, d1, #1 @ 2*P5[8..15]
        vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7]
        vmovl.u8 q14, d6 @ P2[8..15]
        vmovl.u8 q3, d7 @ P6[8..15]
        vmovl.u8 q15, d8 @ P7[0..7]
        vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        vmovl.u8 q1, d12 @ P3[8..15]
        vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15]
        vmovl.u8 q4, d9 @ P7[8..15]
        vshll.u8 q14, d10, #1 @ 2*P3[0..7]
        vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15]
        vmovl.u8 q5, d11 @ P4[0..7]
        vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        vshll.u8 q15, d12, #1 @ 2*P3[8..15]
        vmovl.u8 q6, d13 @ P4[8..15]
        vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        vmovl.u8 q1, d14 @ P8[0..7]
        vmovl.u8 q7, d15 @ P8[8..15]
        vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        vmovl.u8 q4, d16 @ P5[0..7]
        vmovl.u8 q8, d1 @ P5[8..15]
        vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7]
        vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15]
        vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7]
        vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        vrshr.s16 q10, q10, #3
        vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15]
        vrshr.s16 q11, q11, #3
        vmla.i16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        vrshr.s16 q2, q2, #3
        vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        vabs.s16 q10, q10 @ a1[0..7]
        vrshr.s16 q13, q13, #3
        vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        vabs.s16 q3, q11 @ a2[0..7]
        vabs.s16 q2, q2 @ a1[8..15]
        vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        vabs.s16 q11, q1
        vabs.s16 q12, q13 @ a2[8..15]
        vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7]
        vshr.s16 q1, q1, #8 @ clip_sign[0..7]
        vrshr.s16 q15, q15, #3
        vshr.s16 q11, q11, #1 @ clip[0..7]
        vrshr.s16 q14, q14, #3
        vbsl q13, q3, q10 @ a3[0..7]
        vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8..15]
        vabs.s16 q10, q15 @ a0[8..15]
        vshr.s16 q15, q15, #8 @ a0_sign[8..15]
        vbsl q3, q12, q2 @ a3[8..15]
        vabs.s16 q2, q14 @ a0[0..7]
        vabs.s16 q12, q7
        vshr.s16 q7, q7, #8 @ clip_sign[8..15]
        vshr.s16 q14, q14, #8 @ a0_sign[0..7]
        vshr.s16 q12, q12, #1 @ clip[8..15]
        vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15]
        vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite and then take the abs)
        vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15]
        vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq
        vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq
        vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7]
        vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite and then take the abs)
        vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7]
        vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        vceq.i16 q15, q11, #0 @ test clip[0..7] == 0
        vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq
        vceq.i16 q14, q12, #0 @ test clip[8..15] == 0
        vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq
        vcge.s16 q14, q13, q12
        vmov.32 r2, d4[1] @ move to gp reg
        vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        vmov.32 r3, d5[1]
        vcge.s16 q2, q0, q11
        vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15])
        vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7])
        vmov.32 r5, d6[1]
        vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        vmov.32 r6, d7[1]
        and r12, r2, r3
        vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
        vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
        and r14, r5, r6
        vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
        and r12, r12, r14
        vqmovun.s16 d4, q6
        vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
        tst r12, #1
        bne 4f @ none of the 16 pixel pairs should be updated in this case
        vqmovun.s16 d2, q5
        vqmovun.s16 d3, q4
        vqmovun.s16 d5, q8
        tst r2, #1
        bne 1f
        vst2.8 {d2[0], d3[0]}, [r0], r1
        vst2.8 {d2[1], d3[1]}, [r0], r1
        vst2.8 {d2[2], d3[2]}, [r0], r1
        vst2.8 {d2[3], d3[3]}, [r0]
1:      add r0, r4, r1, lsl #2
        tst r3, #1
        bne 2f
        vst2.8 {d2[4], d3[4]}, [r4], r1
        vst2.8 {d2[5], d3[5]}, [r4], r1
        vst2.8 {d2[6], d3[6]}, [r4], r1
        vst2.8 {d2[7], d3[7]}, [r4]
2:      add r4, r0, r1, lsl #2
        tst r5, #1
        bne 3f
        vst2.8 {d4[0], d5[0]}, [r0], r1
        vst2.8 {d4[1], d5[1]}, [r0], r1
        vst2.8 {d4[2], d5[2]}, [r0], r1
        vst2.8 {d4[3], d5[3]}, [r0]
3:      tst r6, #1
        bne 4f
        vst2.8 {d4[4], d5[4]}, [r4], r1
        vst2.8 {d4[5], d5[5]}, [r4], r1
        vst2.8 {d4[6], d5[6]}, [r4], r1
        vst2.8 {d4[7], d5[7]}, [r4]
4:      vpop {d8-d15}
        pop {r4-r6,pc}
endfunc

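@ The helper below screens each 16-byte block, at all four byte offsets within
@ every 32-bit lane, for the start of an escape sequence. In scalar terms the
@ test applied at each position i is roughly (illustrative only):
@
@   buf[i] == 0x00 && buf[i+1] == 0x00 && buf[i+2] == 0x03 && buf[i+3] <= 0x03
@
@ which is what the vbic/veor/vceq.i32 sequences with the 0x3000000 and
@ 0x30000 constants compute four positions at a time. Going by the entry/exit
@ conditions, a caller might use it along these lines (a sketch, not the
@ exact caller):
@
@   done = len - ff_vc1_unescape_buffer_helper_neon(src, len, dst);
@   /* the first `done` bytes have been copied verbatim; continue with the */
@   /* scalar unescape loop at src + done, dst + done                      */
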
@ Copy at most the specified number of bytes from source to destination buffer,
@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
@ On entry:
@ r0 -> source buffer
@ r1 = max number of bytes to copy
@ r2 -> destination buffer, optimally 8-byte aligned
@ On exit:
@ r0 = number of bytes not copied
function ff_vc1_unescape_buffer_helper_neon, export=1
        @ Offset by 48 to screen out cases that are too short for us to handle,
        @ and also make it easy to test for loop termination, or to determine
        @ whether we need an odd number of half-iterations of the loop.
        subs r1, r1, #48
        bmi 90f

        @ Set up useful constants
        vmov.i32 q0, #0x3000000
        vmov.i32 q1, #0x30000

        tst r1, #16
        bne 1f

        vld1.8 {q8, q9}, [r0]!
        vbic q12, q8, q0
        vext.8 q13, q8, q9, #1
        vext.8 q14, q8, q9, #2
        vext.8 q15, q8, q9, #3
        veor q12, q12, q1
        vbic q13, q13, q0
        vbic q14, q14, q0
        vbic q15, q15, q0
        vceq.i32 q12, q12, #0
        veor q13, q13, q1
        veor q14, q14, q1
        veor q15, q15, q1
        vceq.i32 q13, q13, #0
        vceq.i32 q14, q14, #0
        vceq.i32 q15, q15, #0
        add r1, r1, #16
        b 3f

1:      vld1.8 {q10, q11}, [r0]!
        vbic q12, q10, q0
        vext.8 q13, q10, q11, #1
        vext.8 q14, q10, q11, #2
        vext.8 q15, q10, q11, #3
        veor q12, q12, q1
        vbic q13, q13, q0
        vbic q14, q14, q0
        vbic q15, q15, q0
        vceq.i32 q12, q12, #0
        veor q13, q13, q1
        veor q14, q14, q1
        veor q15, q15, q1
        vceq.i32 q13, q13, #0
        vceq.i32 q14, q14, #0
        vceq.i32 q15, q15, #0
        @ Drop through...
2:      vmov q8, q11
        vld1.8 {q9}, [r0]!
        vorr q13, q12, q13
        vorr q15, q14, q15
        vbic q12, q8, q0
        vorr q3, q13, q15
        vext.8 q13, q8, q9, #1
        vext.8 q14, q8, q9, #2
        vext.8 q15, q8, q9, #3
        veor q12, q12, q1
        vorr d6, d6, d7
        vbic q13, q13, q0
        vbic q14, q14, q0
        vbic q15, q15, q0
        vceq.i32 q12, q12, #0
        vmov r3, r12, d6
        veor q13, q13, q1
        veor q14, q14, q1
        veor q15, q15, q1
        vceq.i32 q13, q13, #0
        vceq.i32 q14, q14, #0
        vceq.i32 q15, q15, #0
        orrs r3, r3, r12
        bne 90f
        vst1.64 {q10}, [r2]!
3:      vmov q10, q9
        vld1.8 {q11}, [r0]!
        vorr q13, q12, q13
        vorr q15, q14, q15
        vbic q12, q10, q0
        vorr q3, q13, q15
        vext.8 q13, q10, q11, #1
        vext.8 q14, q10, q11, #2
        vext.8 q15, q10, q11, #3
        veor q12, q12, q1
        vorr d6, d6, d7
        vbic q13, q13, q0
        vbic q14, q14, q0
        vbic q15, q15, q0
        vceq.i32 q12, q12, #0
        vmov r3, r12, d6
        veor q13, q13, q1
        veor q14, q14, q1
        veor q15, q15, q1
        vceq.i32 q13, q13, #0
        vceq.i32 q14, q14, #0
        vceq.i32 q15, q15, #0
        orrs r3, r3, r12
        bne 91f
        vst1.64 {q8}, [r2]!
        subs r1, r1, #32
        bpl 2b

90:     add r0, r1, #48
        bx lr

91:     sub r1, r1, #16
        b 90b
endfunc