/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

/*
 * H.264 qpel MC
 *
 * NEON routines for 8x8 and 16x16 blocks.  Most routines are generated
 * twice through macros: the "put" flavour stores the result, the "avg"
 * flavour first takes the rounding average (vrhadd) with the bytes that
 * are already at the destination.
 *
 * Register usage of the exported ff_*_mcXY_neon entry points, as visible
 * from the code below: r0 = destination, r1 = source, r2 = line stride
 * (used for both source and destination).  Presumably X and Y are the
 * horizontal and vertical quarter-sample offsets; confirm against the C
 * side.  Internal helpers use private register conventions which are
 * described on each helper.
 *
 * Several helpers deliberately have no return instruction and fall through
 * into the function that is emitted right after them, so the order of the
 * definitions must not be changed.  This relies on the function/endfunc
 * macros (defined outside this file) not emitting instructions in between;
 * verify before touching them.
 */

/*
 * lowpass_const r
 *
 * Loads the two filter multipliers into d6 through the scratch core
 * register r: 16-bit lane d6[0] = 5, 16-bit lane d6[1] = 20.
 * Clobbers r and the low 32 bits of d6.
 */
.macro  lowpass_const   r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
.endm

/*
 * lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
 *
 * Runs the 6-tap filter along two rows of unsigned bytes.  r0:r1 hold 16
 * consecutive bytes of the first row, r2:r3 those of the second row.
 * Eight outputs are produced per row, output i being
 *     (x[i] + x[i+5]) + 20 * (x[i+2] + x[i+3]) - 5 * (x[i+1] + x[i+4])
 * The multipliers are read from d6 (see lowpass_const).
 *
 * narrow=1: the sums are built in q0 / q8, then shifted right by 5 with
 *           rounding and unsigned saturation into D registers d0 / d1.
 * narrow=0: d0 / d1 name Q registers that receive the raw 16-bit sums.
 *
 * Clobbers q1, q2, q9, q10, d30, d31 (plus q0 and q8 when narrow=1).
 * The instructions of the two rows are interleaved on purpose.
 */
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
  .if \narrow
        t0 .req q0
        t1 .req q8
  .else
        t0 .req \d0
        t1 .req \d1
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3            /* row 0: x[2] + x[3] */
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5            /* row 0: x[1] + x[4] */
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30           /* row 0: x[0] + x[5] */
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]         /* row 0: += 20 * ... */
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19           /* row 1: x[2] + x[3] */
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]         /* row 0: -= 5 * ...  */
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21           /* row 1: x[1] + x[4] */
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31           /* row 1: x[0] + x[5] */
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
  .endif
        .unreq  t0
        .unreq  t1
.endm

/*
 * lowpass_8_1 r0, r1, d0, narrow=1
 *
 * Single-row version of lowpass_8 with the same formula and the same
 * meaning of narrow.  Clobbers q1, q2, d30 (plus q0 when narrow=1).
 */
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
  .if \narrow
        t0 .req q0
  .else
        t0 .req \d0
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
  .endif
        .unreq  t0
.endm

/*
 * lowpass_8.16 r0, r1, l0, h0, l1, h1, d
 *
 * The same 6-tap filter applied to signed 16-bit values with 32-bit
 * accumulation.  r0:r1 are two Q registers holding 16 consecutive 16-bit
 * values; l0/h0 and l1/h1 are D registers.  At every call site in this
 * file they are the low/high halves of r0 and r1.  r1 is overwritten with
 * the window shifted by five elements before l1/h1 are read, so with that
 * aliasing l0 + l1 and h0 + h1 give x[i] + x[i+5].
 * The factor 20 is formed as (v << 4) + (v << 2), the factor 5 as
 * v + (v << 2).  The 32-bit result is shifted right by 10 with rounding,
 * narrowed to 16 bits and finally saturated to unsigned bytes in D
 * register d.
 *
 * Clobbers q0-q3, q8-q10, q15 and r1.  q3 contains d6, so the constants
 * written by lowpass_const do not survive this macro.
 */
.macro  lowpass_8.16    r0,  r1,  l0,  h0,  l1,  h1,  d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0            /* low:  x[2] + x[3] */
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1            /* high: x[2] + x[3] */
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6            /* low:  x[1] + x[4] */
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7            /* high: x[1] + x[4] */
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3            /* low:  20 * (x[2] + x[3]) */
        vadd.i32        q10, q10, q15           /* low:   5 * (x[1] + x[4]) */

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3            /* high: 20 * (x[2] + x[3]) */
        vadd.i32        q2,  q2,  q15           /* high:  5 * (x[1] + x[4]) */

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
.endm

/*
 * 16x16 horizontal lowpass into a packed buffer.
 * In:  r0 = destination, r1 = source, r2 = source stride.
 * The destination stride is forced to 8 (r3).  r0 is not rewound between
 * the two 8-wide halves, so the right half is written directly behind the
 * 16 rows of the left half.
 * The return address is parked in r4; the second half is a tail call.
 * Clobbers r3, r4, r12 and everything the 8-wide helper clobbers.
 */
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             r12, #16                /* 16 rows, left half */
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4    /* back to the first row ... */
        add             r1,  r1,  #8            /* ... of the right half */
        mov             r12, #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
endfunc

/*
 * Horizontal lowpass, put/avg.
 * In:  r0 = destination, r3 = destination stride,
 *      r1 = source,      r2 = source stride,
 *      d6 = filter constants (lowpass_const).
 * The 8-wide function additionally takes the row count in r12; it handles
 * two rows per iteration, so r12 has to be even and non-zero.
 * Destination accesses carry a 64-bit alignment hint.
 */
.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4    /* rewind both pointers 16 rows */
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8            /* and move to the right half */
        add             r1,  r1,  #8
        mov             r12, #16
        pop             {lr}
        /* no return: falls through into the 8-wide function below */
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        subs            r12, r12, #2            /* flags are consumed by bne below */
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
  .ifc \type,avg
        vld1.8          {d2},      [r0,:64], r3
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d0,  d0,  d2
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3            /* undo the single post-increment */
  .endif
        vst1.8          {d0},      [r0,:64], r3
        vst1.8          {d16},     [r0,:64], r3
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

/*
 * Horizontal lowpass averaged with a second source ("l2"), put/avg.
 * In:  r0 = destination, r1 = source to filter, r3 = second source,
 *      r2 = stride shared by all three pointers,
 *      d6 = filter constants.
 * The 8-wide function additionally takes an even row count in r12.
 * Result = rounding average of the filtered row and the r3 row.
 */
.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4    /* rewind all pointers 16 rows */
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8            /* and move to the right half */
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             r12, #16
        pop             {lr}
        /* no return: falls through into the 8-wide function below */
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        vld1.8          {d28},     [r3], r2
        vld1.8          {d29},     [r3], r2
        subs            r12, r12, #2            /* flags are consumed by bne below */
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14           /* average with the second source */
  .ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       q0,  q0,  q1
        sub             r0,  r0,  r2            /* undo the single post-increment */
  .endif
        vst1.8          {d0},      [r0,:64], r2
        vst1.8          {d1},      [r0,:64], r2
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

/*
 * 16x16 vertical lowpass into a packed buffer.
 * In:  r0 = destination, r1 = source, r3 = source stride.
 * The destination stride is forced to 8 (r2).  r0 is never rewound, so the
 * four 8x8 results are written back to back: upper left, lower left, upper
 * right, lower right.
 * Each 8x8 call leaves r1 twelve rows further down; it is stepped back
 * four rows to reach the next 8-row band.
 * The return address is parked in r4; the last 8x8 is a tail call.
 */
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4    /* back to the top row ... */
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            /* ... of the right half */
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
endfunc

/*
 * Vertical lowpass, put/avg.
 * In:  r0 = destination, r2 = destination stride,
 *      r1 = source (first of the 13 rows that are read), r3 = source stride,
 *      d6 = filter constants.
 * Out: r0 advanced by 8 rows, r1 advanced by 12 rows.
 * The 8x8 function overwrites d8-d15 without saving them; callers wrap the
 * call in vpush/vpop.  The 16x16 function keeps the return address in r4.
 *
 * The 13 input rows are transposed (transpose_8x8 is defined outside this
 * file, presumably in neon.S) so that the row filter lowpass_8 can be used
 * on columns, and the result is transposed back.
 */
.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4    /* destination: top of right half */
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4    /* source: top of right half */
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        /* no return: falls through into the 8x8 function below */
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

  .ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2
        vld1.8          {d11}, [r0,:64], r2
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vrhadd.u8       d26, d26, d27
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2,  lsl #3   /* back to the first of the 8 rows */
  .endif

        vst1.8          {d8},  [r0,:64], r2
        vst1.8          {d10}, [r0,:64], r2
        vst1.8          {d12}, [r0,:64], r2
        vst1.8          {d14}, [r0,:64], r2
        vst1.8          {d22}, [r0,:64], r2
        vst1.8          {d24}, [r0,:64], r2
        vst1.8          {d26}, [r0,:64], r2
        vst1.8          {d28}, [r0,:64], r2

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

/*
 * Vertical lowpass averaged with a second source ("l2"), put/avg.
 * In:  r0  = destination,   r3 = stride of destination and of r1,
 *      r1  = source to filter (first of 13 rows),
 *      r12 = second source, r2 = stride of the second source,
 *      d6  = filter constants.
 * Out: r0 and r12 advanced by 8 rows, r1 advanced by 12 rows.
 * The 8x8 function overwrites d8-d15 without saving them; the 16x16
 * function keeps the return address in r4.
 */
.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4    /* destination: top of right half */
        sub             r12, r12, r2, lsl #4    /* second source: likewise */
        add             r0,  r0,  #8
        add             r12, r12, #8
        sub             r1,  r1,  r3, lsl #4    /* filtered source: likewise */
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        /* no return: falls through into the 8x8 function below */
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        /* average with the eight rows of the second source */
        vld1.8          {d0},  [r12], r2
        vld1.8          {d1},  [r12], r2
        vld1.8          {d2},  [r12], r2
        vld1.8          {d3},  [r12], r2
        vld1.8          {d4},  [r12], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.8          {d5},  [r12], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.8          {d10}, [r12], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.8          {d11}, [r12], r2
        vrhadd.u8       q5,  q5,  q13

  .ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3,  lsl #3   /* back to the first of the 8 rows */
  .endif

        vst1.8          {d0},  [r0,:64], r3
        vst1.8          {d1},  [r0,:64], r3
        vst1.8          {d2},  [r0,:64], r3
        vst1.8          {d3},  [r0,:64], r3
        vst1.8          {d4},  [r0,:64], r3
        vst1.8          {d5},  [r0,:64], r3
        vst1.8          {d10}, [r0,:64], r3
        vst1.8          {d11}, [r0,:64], r3

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

/*
 * Common first part of the 8x8 horizontal+vertical lowpass.
 * In:  r1 = source (first of the 13 rows that are read), r3 = source stride,
 *      r4 = scratch area of at least 16*12 bytes, accessed with 128-bit
 *           alignment hints.
 * Out: the eight result rows, in order, in d12, d13, d14, d15, d8, d9, d10,
 *      d11.  r1 is advanced by 12 rows, r4 ends at the value it had on entry.
 * Sets up the filter constants itself (and destroys them again), clobbers
 * r12 and q0-q15.
 *
 * Steps: the rows are filtered horizontally keeping 16-bit sums (narrow=0),
 * the first twelve rows go through the scratch area, the intermediate is
 * rearranged with swap4 / transpose16_4x4 (defined outside this file;
 * presumably a transpose of the 16-bit data - confirm there) and then
 * filtered a second time with lowpass_8.16, half of it spilled to and
 * reloaded from the scratch area.
 */
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   r12
        mov             r12, #12                /* twelve rows, two per iteration */
1:      vld1.8          {d0, d1},  [r1], r3
        vld1.8          {d16,d17}, [r1], r3
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.8          {d22-d25}, [r4,:128]!
        bne             1b

        /* thirteenth row stays in q12 */
        vld1.8          {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        /* read the twelve stored rows back, last one first */
        mov             r12, #-16
        add             r4,  r4,  r12
        vld1.8          {d30,d31}, [r4,:128], r12
        vld1.8          {d20,d21}, [r4,:128], r12
        vld1.8          {d18,d19}, [r4,:128], r12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d14,d15}, [r4,:128], r12
        vld1.8          {d12,d13}, [r4,:128], r12
        vld1.8          {d10,d11}, [r4,:128], r12
        vld1.8          {d8, d9},  [r4,:128], r12
        vld1.8          {d6, d7},  [r4,:128], r12
        vld1.8          {d4, d5},  [r4,:128], r12
        vld1.8          {d2, d3},  [r4,:128], r12
        vld1.8          {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        /* spill the pairs that are filtered later */
        vst1.8          {d30,d31}, [r4,:128]!
        vst1.8          {d6, d7},  [r4,:128]!
        vst1.8          {d20,d21}, [r4,:128]!
        vst1.8          {d4, d5},  [r4,:128]!
        vst1.8          {d18,d19}, [r4,:128]!
        vst1.8          {d2, d3},  [r4,:128]!
        vst1.8          {d16,d17}, [r4,:128]!
        vst1.8          {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        /* reload the spilled pairs in reverse order (r12 is still -16) */
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
endfunc

/*
 * 8x8 horizontal+vertical lowpass, put/avg.
 * In:  r0 = destination, r2 = destination stride,
 *      r1 = source, r3 = source stride, r4 = scratch area
 *      (see put_h264_qpel8_hv_lowpass_neon_top).
 * Out: r0 advanced by 8 rows, r1 advanced by 12 rows.
 * The return address is kept in r10 across the call.  Overwrites q4-q7
 * without saving them.
 */
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        vld1.8          {d0},      [r0,:64], r2
        vld1.8          {d1},      [r0,:64], r2
        vld1.8          {d2},      [r0,:64], r2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       q6,  q6,  q0
        vld1.8          {d4},      [r0,:64], r2
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       q7,  q7,  q1
        vld1.8          {d6},      [r0,:64], r2
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       q4,  q4,  q2
        vrhadd.u8       q5,  q5,  q3
        sub             r0,  r0,  r2,  lsl #3   /* back to the first of the 8 rows */
  .endif

        vst1.8          {d12},     [r0,:64], r2
        vst1.8          {d13},     [r0,:64], r2
        vst1.8          {d14},     [r0,:64], r2
        vst1.8          {d15},     [r0,:64], r2
        vst1.8          {d8},      [r0,:64], r2
        vst1.8          {d9},      [r0,:64], r2
        vst1.8          {d10},     [r0,:64], r2
        vst1.8          {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

/*
 * 8x8 horizontal+vertical lowpass averaged with a second source, put/avg.
 * In:  r0 = destination, r3 = stride of destination and of r1,
 *      r1 = source, r4 = scratch area,
 *      r2 = second source: 64 contiguous bytes (eight rows of eight),
 *           read with 128-bit alignment hints.
 * Out: r0 advanced by 8 rows, r1 by 12 rows, r2 by 64 bytes.
 * The return address is kept in r10 across the call.
 */
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.8          {d0, d1},  [r2,:128]!
        vld1.8          {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.8          {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.8          {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
  .ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3
        vld1.8          {d17},     [r0,:64], r3
        vld1.8          {d18},     [r0,:64], r3
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {d20},     [r0,:64], r3
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {d22},     [r0,:64], r3
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       q2,  q2,  q10
        vrhadd.u8       q3,  q3,  q11
        sub             r0,  r0,  r3,  lsl #3   /* back to the first of the 8 rows */
  .endif
        vst1.8          {d0},      [r0,:64], r3
        vst1.8          {d1},      [r0,:64], r3
        vst1.8          {d2},      [r0,:64], r3
        vst1.8          {d3},      [r0,:64], r3
        vst1.8          {d4},      [r0,:64], r3
        vst1.8          {d5},      [r0,:64], r3
        vst1.8          {d6},      [r0,:64], r3
        vst1.8          {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

/*
 * 16x16 horizontal+vertical lowpass built from four 8x8 calls in the order
 * upper left, lower left, upper right, lower right.  The return address is
 * kept in r9 (the 8x8 helpers use r10); the last 8x8 is a tail call.
 * Register inputs are those of the corresponding 8x8 helper, except that
 * the l2 variant derives the second source itself as r4 - 256, i.e. it
 * expects the 256-byte packed intermediate directly below the scratch area.
 */
.macro  h264_qpel16_hv  type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4    /* source: top of right half */
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4    /* destination: top of right half */
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4    /* source: top of right half */
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4    /* destination: top of right half */
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

/*
 * Exported 8x8 entry points: ff_<type>_h264_qpel8_mcXY_neon.
 * In: r0 = destination, r1 = source, r2 = stride.
 *
 * Entry points that need an intermediate buffer save the original sp in
 * r11, round sp down to a multiple of 16 and allocate below it.  Lines
 * prefixed with A or T are the two alternative ways of doing the rounding
 * (the prefixes are macros defined outside this file, presumably selecting
 * ARM or Thumb encoding).  Such entry points push r0/r1 first and reload
 * them later with "ldrd r0, r1, [r11], #8", which also steps r11 past the
 * two words so that "mov sp, r11" leaves only the remaining registers to
 * pop.
 *
 * Several entry points only adjust a pointer and then branch to a local
 * label inside another entry point; the register list they push must
 * therefore stay identical to the one of the entry point they join.
 * d8-d15 are saved around every helper that overwrites them.
 */
.macro  h264_qpel8      type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 /* second source: src itself */
        sub             r1,  r1,  #2            /* filter window starts 2 left */
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2                 /* destination stride = stride */
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            /* second source: src + 1 */
        sub             r1,  r1,  #2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             r12, r1                 /* second source: src itself */
\type\()_h264_qpel8_mc01:
        /* joined by mc03 with r12 = src + stride */
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1    /* filter window starts 2 rows up */
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        /*
         * Joined by mc31, mc13 and mc33.  r1 is the source of the horizontal
         * pass, the r1 value saved on the stack the source of the vertical
         * pass.
         */
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #64           /* 8x8 intermediate */
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8                 /* intermediate is packed */
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  r1,  [r11], #8
        mov             r3,  r2
        add             r12, sp,  #64           /* intermediate sits above the vpush area */
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        /* joined by mc23 with r1 = src + stride for the horizontal pass */
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)  /* 8x8 intermediate + hv scratch */
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0                 /* scratch follows the intermediate */
        ldrd            r0,  r1,  [r11], #8
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64           /* second source: the intermediate */
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}       /* vertical pass uses src + 1 */
        sub             r1,  r1,  #1            /* horizontal pass uses src */
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        /* joined by mc32 with r1 = src + 1 for the vertical pass */
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)  /* 8x8 intermediate + hv scratch */
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0                 /* scratch follows the intermediate */
        ldrd            r0,  r1,  [r11], #8
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64           /* second source: the intermediate */
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        /* no lowpass_const here: the hv helper sets the constants itself */
        push            {r4, r10, r11, lr}
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      /* hv scratch */
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             r12, r1,  r2            /* second source: src + stride */
        b               \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}       /* vertical pass uses src */
        add             r1,  r1,  r2            /* horizontal pass: src + stride */
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}       /* vertical pass uses src + 1 */
        add             r1,  r1,  r2
        sub             r1,  r1,  #1            /* horizontal pass: src + stride */
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

/*
 * Exported 16x16 entry points: ff_<type>_h264_qpel16_mcXY_neon.
 * In: r0 = destination, r1 = source, r2 = stride.
 *
 * Same structure as the 8x8 entry points.  Differences visible below: the
 * 16x16 helpers keep return addresses in r4 and r9, so those registers are
 * part of the pushed lists, and the row count is set by the 16x16 helpers
 * themselves.  Where r5 is pushed it is not otherwise used in this file;
 * presumably it only pads the register list - confirm before changing it.
 */
.macro  h264_qpel16     type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 /* second source: src itself */
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            /* second source: src + 1 */
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             r12, r1                 /* second source: src itself */
\type\()_h264_qpel16_mc01:
        /* joined by mc03 with r12 = src + stride */
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        /*
         * Joined by mc31, mc13 and mc33.  r1 is the source of the horizontal
         * pass, the r1 value saved on the stack the source of the vertical
         * pass.
         */
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #256          /* 16x16 intermediate */
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16                /* intermediate stride */
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        ldrd            r0,  r1,  [r11], #8
        mov             r3,  r2
        add             r12, sp,  #64           /* intermediate sits above the vpush area */
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        /* joined by mc23 with r1 = src + stride for the horizontal pass */
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12) /* packed intermediate + hv scratch */
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0                 /* scratch follows the intermediate */
        ldrd            r0,  r1,  [r11], #8
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}   /* vertical pass uses src + 1 */
        sub             r1,  r1,  #1            /* horizontal pass uses src */
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        /* joined by mc32 with r1 = src + 1 for the vertical pass */
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12) /* packed intermediate + hv scratch */
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0                 /* scratch follows the intermediate */
        ldrd            r0,  r1,  [r11], #8
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      /* hv scratch */
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             r12, r1,  r2            /* second source: src + stride */
        b               \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}   /* vertical pass uses src */
        add             r1,  r1,  r2            /* horizontal pass: src + stride */
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}   /* vertical pass uses src + 1 */
        add             r1,  r1,  r2
        sub             r1,  r1,  #1            /* horizontal pass: src + stride */
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg