/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

        /* H.264 qpel MC */

// Load the two filter coefficients used by the lowpass macros into v6.
//   r: 32-bit scratch general register; it is left holding (20 << 16) | 5.
// After the macro v6.H[0] = 5 and v6.H[1] = 20. The other lanes of v6 are
// not written here.
.macro  lowpass_const   r
        movz            \r, #20, lsl #16
        movk            \r, #5
        mov             v6.S[0], \r
.endm

//trashes v0-v5
// Horizontal 6-tap filter for two rows of 8 output pixels.
//   r0:r1  first source row,  bytes 0-7 in r0 and bytes 8-15 in r1
//   r2:r3  second source row, same layout
//   d0/d1  results for the first/second row
// With s[] the 16 source bytes of a row, every output lane i is
//   s[i] + s[i+5] + 20 * (s[i+2] + s[i+3]) - 5 * (s[i+1] + s[i+4])
// accumulated in 16 bits. Requires v6.H[0] = 5 and v6.H[1] = 20.
// With narrow=1 (default) the sums are rounded, shifted right by 5 and
// saturated to unsigned bytes in d0.8B/d1.8B; with narrow=0 the 16-bit
// sums are left in d0.8H/d1.8H.
// d0 is written only after the last read of r0/r1 and d1 only after d0 is
// seeded, so d0 may be r0 and d1 may be r1 or r2, as the callers in this
// file do.
.macro  lowpass_8       r0, r1, r2, r3, d0, d1, narrow=1
        ext             v2.8B, \r0\().8B, \r1\().8B, #2
        ext             v3.8B, \r0\().8B, \r1\().8B, #3
        uaddl           v2.8H, v2.8B, v3.8B              // s[i+2] + s[i+3]
        ext             v4.8B, \r0\().8B, \r1\().8B, #1
        ext             v5.8B, \r0\().8B, \r1\().8B, #4
        uaddl           v4.8H, v4.8B, v5.8B              // s[i+1] + s[i+4]
        ext             v1.8B, \r0\().8B, \r1\().8B, #5
        uaddl           \d0\().8H, \r0\().8B, v1.8B      // s[i] + s[i+5]
        ext             v0.8B, \r2\().8B, \r3\().8B, #2
        mla             \d0\().8H, v2.8H, v6.H[1]        // += 20 * centre taps
        ext             v1.8B, \r2\().8B, \r3\().8B, #3
        uaddl           v0.8H, v0.8B, v1.8B
        ext             v1.8B, \r2\().8B, \r3\().8B, #1
        mls             \d0\().8H, v4.8H, v6.H[0]        // -= 5 * inner taps
        ext             v3.8B, \r2\().8B, \r3\().8B, #4
        uaddl           v1.8H, v1.8B, v3.8B
        ext             v2.8B, \r2\().8B, \r3\().8B, #5
        uaddl           \d1\().8H, \r2\().8B, v2.8B
        mla             \d1\().8H, v0.8H, v6.H[1]
        mls             \d1\().8H, v1.8H, v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B, \d0\().8H, #5
        sqrshrun        \d1\().8B, \d1\().8H, #5
  .endif
.endm
//trashes v0-v4
// Vertical 6-tap filter: seven input rows (8 pixels per register) give two
// output rows.
//   d0 = r0 + r5 + 20 * (r2 + r3) - 5 * (r1 + r4)
//   d1 = r1 + r6 + 20 * (r3 + r4) - 5 * (r2 + r5)
// Requires v6.H[0] = 5 and v6.H[1] = 20. With narrow=1 (default) the 16-bit
// sums are rounded, shifted right by 5 and saturated to unsigned bytes.
// All sums that read r0/r1 are formed before d0/d1 are written, so the
// callers below pass d0 = r0 and d1 = r1.
.macro  lowpass_8_v     r0, r1, r2, r3, r4, r5, r6, d0, d1, narrow=1
        uaddl           v2.8H, \r2\().8B, \r3\().8B
        uaddl           v0.8H, \r3\().8B, \r4\().8B
        uaddl           v4.8H, \r1\().8B, \r4\().8B
        uaddl           v1.8H, \r2\().8B, \r5\().8B
        uaddl           \d0\().8H, \r0\().8B, \r5\().8B
        uaddl           \d1\().8H, \r1\().8B, \r6\().8B
        mla             \d0\().8H, v2.8H, v6.H[1]
        mls             \d0\().8H, v4.8H, v6.H[0]
        mla             \d1\().8H, v0.8H, v6.H[1]
        mls             \d1\().8H, v1.8H, v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B, \d0\().8H, #5
        sqrshrun        \d1\().8B, \d1\().8H, #5
  .endif
.endm

//trashes v0-v5, v7, v30-v31
// In-place horizontal 6-tap filter without narrowing. r0 and r1 each hold
// 16 source bytes of one row on entry and 8 16-bit filter sums on exit.
// The shifted copies are made by rotating the whole register with ext and
// then using only its low 8 bytes. Requires v6.H[0] = 5 and v6.H[1] = 20.
.macro  lowpass_8H      r0, r1
        ext             v0.16B, \r0\().16B, \r0\().16B, #2
        ext             v1.16B, \r0\().16B, \r0\().16B, #3
        uaddl           v0.8H, v0.8B, v1.8B
        ext             v2.16B, \r0\().16B, \r0\().16B, #1
        ext             v3.16B, \r0\().16B, \r0\().16B, #4
        uaddl           v2.8H, v2.8B, v3.8B
        ext             v30.16B, \r0\().16B, \r0\().16B, #5
        uaddl           \r0\().8H, \r0\().8B, v30.8B
        ext             v4.16B, \r1\().16B, \r1\().16B, #2
        mla             \r0\().8H, v0.8H, v6.H[1]
        ext             v5.16B, \r1\().16B, \r1\().16B, #3
        uaddl           v4.8H, v4.8B, v5.8B
        ext             v7.16B, \r1\().16B, \r1\().16B, #1
        mls             \r0\().8H, v2.8H, v6.H[0]
        ext             v0.16B, \r1\().16B, \r1\().16B, #4
        uaddl           v7.8H, v7.8B, v0.8B
        ext             v31.16B, \r1\().16B, \r1\().16B, #5
        uaddl           \r1\().8H, \r1\().8B, v31.8B
        mla             \r1\().8H, v4.8H, v6.H[1]
        mls             \r1\().8H, v7.8H, v6.H[0]
.endm

// trashes v2-v5, v30
// Single-row form of the horizontal filter: r0:r1 is one source row (bytes
// 0-7 and 8-15), d0 receives the result, narrowed to bytes when narrow=1.
// NOTE(review): no invocation of this macro is visible in this file.
.macro  lowpass_8_1     r0, r1, d0, narrow=1
        ext             v2.8B, \r0\().8B, \r1\().8B, #2
        ext             v3.8B, \r0\().8B, \r1\().8B, #3
        uaddl           v2.8H, v2.8B, v3.8B
        ext             v4.8B, \r0\().8B, \r1\().8B, #1
        ext             v5.8B, \r0\().8B, \r1\().8B, #4
        uaddl           v4.8H, v4.8B, v5.8B
        ext             v30.8B, \r0\().8B, \r1\().8B, #5
        uaddl           \d0\().8H, \r0\().8B, v30.8B
        mla             \d0\().8H, v2.8H, v6.H[1]
        mls             \d0\().8H, v4.8H, v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B, \d0\().8H, #5
  .endif
.endm

// trashes v0-v7
// Vertical 6-tap filter over six rows of signed 16-bit sums (r0..r5, as
// produced by lowpass_8H), using 32-bit accumulators:
//   acc = (r0 + r5) + 20 * (r2 + r3) - 5 * (r1 + r4)
// 20 * x is built as (x << 4) + (x << 2) and 5 * x as x + (x << 2), so the
// constants in v6 are not used. v6 is used as scratch and is overwritten,
// which means the lowpass_const values are gone after this macro.
// The result is rounded and shifted right by 10, narrowed to 16 bits and
// then saturated to unsigned bytes in r0.8B. Only r0 is overwritten.
.macro  lowpass_8.16    r0, r1, r2, r3, r4, r5
        saddl           v5.4S, \r2\().4H, \r3\().4H
        saddl2          v1.4S, \r2\().8H, \r3\().8H
        saddl           v6.4S, \r1\().4H, \r4\().4H
        saddl2          v2.4S, \r1\().8H, \r4\().8H
        saddl           v0.4S, \r0\().4H, \r5\().4H
        saddl2          v4.4S, \r0\().8H, \r5\().8H

        shl             v3.4S, v5.4S, #4
        shl             v5.4S, v5.4S, #2
        shl             v7.4S, v6.4S, #2
        add             v5.4S, v5.4S, v3.4S              // 20 * (r2 + r3), low half
        add             v6.4S, v6.4S, v7.4S              //  5 * (r1 + r4), low half

        shl             v3.4S, v1.4S, #4
        shl             v1.4S, v1.4S, #2
        shl             v7.4S, v2.4S, #2
        add             v1.4S, v1.4S, v3.4S              // 20 * (r2 + r3), high half
        add             v2.4S, v2.4S, v7.4S              //  5 * (r1 + r4), high half

        add             v5.4S, v5.4S, v0.4S
        sub             v5.4S, v5.4S, v6.4S

        add             v1.4S, v1.4S, v4.4S
        sub             v1.4S, v1.4S, v2.4S

        rshrn           v5.4H, v5.4S, #10
        rshrn2          v5.8H, v1.4S, #10

        sqxtun          \r0\().8B, v5.8H
.endm

// Horizontal filter of a 16x16 block into a packed buffer.
//   x0 = destination buffer, x1 = source, x2 = source stride
// The left 8 columns of all 16 rows are written first with a destination
// stride of 8; x0 is not rewound, so the right 8 columns follow directly.
// Return address is kept in x4; x3 and x12 are overwritten. The second
// half is a tail call.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4, x30
        mov             x12, #16
        mov             x3, #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             x1, x1, x2, lsl #4               // back to the first row
        add             x1, x1, #8                       // right half of the source
        mov             x12, #16
        mov             x30, x4
        b               put_h264_qpel8_h_lowpass_neon
endfunc

// Instantiates <type>_h264_qpel16_h_lowpass_neon and
// <type>_h264_qpel8_h_lowpass_neon, with type = put or avg.
.macro  h264_qpel_h_lowpass type
// 16 pixels wide, 16 rows. Runs the 8-wide routine on the left half,
// rewinds dst (stride x3) and src (stride x2) by 16 rows, steps 8 bytes to
// the right and reloads the row count. There is no ret or branch before
// endfunc: execution continues into the 8-wide routine placed directly
// below, which returns to the original caller through the restored x30.
// This relies on function/endfunc emitting no code between the two bodies.
// Return address is kept in x13.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0, x0, x3, lsl #4
        sub             x1, x1, x2, lsl #4
        add             x0, x0, #8
        add             x1, x1, #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8 pixels wide, two rows per loop iteration.
//   x0 = dst, x1 = src (16 bytes are loaded per row), x2 = src stride,
//   x3 = dst stride, x12 = row count
// The loop ends when x12 reaches exactly zero after subtracting 2, so x12
// must be even and non-zero on entry. For avg the result is combined with
// the bytes already at dst using a rounding halving add.
// v6 must hold the lowpass_const values. Clobbers v0-v5, v16, v17, v28, v29.
function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8B, v29.8B}, [x1], x2
        ld1             {v16.8B, v17.8B}, [x1], x2
        subs            x12, x12, #2                     // flags consumed by b.ne below
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        ld1             {v2.8B}, [x0], x3
        ld1             {v3.8B}, [x0]
        urhadd          v28.8B, v28.8B, v2.8B
        urhadd          v16.8B, v16.8B, v3.8B
        sub             x0, x0, x3                       // undo the peek at dst
  .endif
        st1             {v28.8B}, [x0], x3
        st1             {v16.8B}, [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

// Instantiates the horizontal filter variants that additionally average the
// filter output with a second source (the l2 suffix), type = put or avg.
.macro  h264_qpel_h_lowpass_l2 type
// 16 pixels wide, 16 rows. Same scheme as the plain 16-wide routine: left
// half through a call, then dst, src and the second source are rewound by
// 16 rows of stride x2 and moved 8 bytes right, and execution continues
// into the 8-wide routine directly below (no ret or branch before endfunc).
// Return address is kept in x13.
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0, x0, x2, lsl #4
        sub             x1, x1, x2, lsl #4
        sub             x3, x3, x2, lsl #4
        add             x0, x0, #8
        add             x1, x1, #8
        add             x3, x3, #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8 pixels wide, two rows per loop iteration.
//   x0 = dst, x1 = src (16 bytes loaded per row), x3 = second source,
//   x2 = stride used for dst, src and the second source, x12 = row count
//   (even, non-zero)
// Output is the rounding average of the filtered row and the second source
// row; for avg it is averaged once more with the bytes already at dst.
// v6 must hold the lowpass_const values.
function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8B, v27.8B}, [x1], x2
        ld1             {v16.8B, v17.8B}, [x1], x2
        ld1             {v28.8B}, [x3], x2
        ld1             {v29.8B}, [x3], x2
        subs            x12, x12, #2                     // flags consumed by b.ne below
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8B, v26.8B, v28.8B
        urhadd          v27.8B, v27.8B, v29.8B
  .ifc \type,avg
        ld1             {v2.8B}, [x0], x2
        ld1             {v3.8B}, [x0]
        urhadd          v26.8B, v26.8B, v2.8B
        urhadd          v27.8B, v27.8B, v3.8B
        sub             x0, x0, x2
  .endif
        st1             {v26.8B}, [x0], x2
        st1             {v27.8B}, [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

// Vertical filter of a 16x16 block into a packed buffer.
//   x0 = destination buffer, x1 = source (first of the 21 rows read),
//   x3 = source stride
// Four 8x8 calls with a destination stride of 8. x0 is never rewound, so
// the output order is: left half rows 0-7, left half rows 8-15, right half
// rows 0-7, right half rows 8-15. Each 8x8 call leaves x1 twelve rows
// further down, hence the 4-row step back between vertically adjacent
// calls. Return address is kept in x4; x2 is overwritten. The last call is
// a tail call.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4, x30
        mov             x2, #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1, x1, x3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1, x1, x3, lsl #4               // 16 + 4 rows back: top row again
        sub             x1, x1, x3, lsl #2
        add             x1, x1, #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1, x1, x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon
endfunc

// Instantiates <type>_h264_qpel16_v_lowpass_neon and
// <type>_h264_qpel8_v_lowpass_neon, with type = put or avg.
.macro  h264_qpel_v_lowpass type
// 16x16 as four 8x8 calls: top-left, bottom-left, top-right, and the
// bottom-right one by continuing into the 8x8 routine directly below (no
// ret or branch before endfunc). dst (stride x2) is moved back 16 rows and
// 8 bytes right before the right half. Return address is kept in x4.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4, x30
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1, x1, x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x0, x0, x2, lsl #4
        add             x0, x0, #8
        sub             x1, x1, x3, lsl #4
        sub             x1, x1, x3, lsl #2
        add             x1, x1, #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1, x1, x3, lsl #2
        mov             x30, x4
endfunc

// 8x8 vertical filter.
//   x0 = dst, x2 = dst stride, x1 = first of 13 source rows, x3 = src stride
// Reads 13 rows of 8 bytes and writes 8 rows. On return x0 has advanced by
// 8 rows and x1 by 12 rows (the last load does not post-increment).
// For avg the result is averaged with the bytes already at dst.
// v6 must hold the lowpass_const values. Clobbers v0-v4 and v16-v28
// (v16-v31 for avg).
function \type\()_h264_qpel8_v_lowpass_neon
        ld1             {v16.8B}, [x1], x3
        ld1             {v17.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v25.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v27.8B}, [x1], x3
        ld1             {v28.8B}, [x1]

        // Two output rows per invocation, written over the two oldest rows.
        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
  .ifc \type,avg
        ld1             {v24.8B}, [x0], x2
        ld1             {v25.8B}, [x0], x2
        ld1             {v26.8B}, [x0], x2
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v27.8B}, [x0], x2
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v28.8B}, [x0], x2
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v29.8B}, [x0], x2
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v30.8B}, [x0], x2
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v31.8B}, [x0], x2
        urhadd          v21.8B, v21.8B, v29.8B
        urhadd          v22.8B, v22.8B, v30.8B
        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0, x0, x2, lsl #3               // back to the first dst row
  .endif

        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        st1             {v18.8B}, [x0], x2
        st1             {v19.8B}, [x0], x2
        st1             {v20.8B}, [x0], x2
        st1             {v21.8B}, [x0], x2
        st1             {v22.8B}, [x0], x2
        st1             {v23.8B}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

// Instantiates the vertical filter variants that additionally average the
// filter output with a second source, type = put or avg.
.macro  h264_qpel_v_lowpass_l2 type
// 16x16 as four 8x8 calls, the last one by continuing into the 8x8 routine
// directly below (no ret or branch before endfunc). Before the right half,
// dst (stride x3) and the second source x12 (stride x2) are moved back 16
// rows and 8 bytes right. Return address is kept in x4.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4, x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1, x1, x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x0, x0, x3, lsl #4
        sub             x12, x12, x2, lsl #4
        add             x0, x0, #8
        add             x12, x12, #8
        sub             x1, x1, x3, lsl #4
        sub             x1, x1, x3, lsl #2
        add             x1, x1, #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1, x1, x3, lsl #2
        mov             x30, x4
endfunc

// 8x8 vertical filter averaged with a second source.
//   x0 = dst, x1 = first of 13 source rows, x3 = stride of src and dst,
//   x12 = second source, x2 = stride of the second source
// On return x0 and x12 have advanced by 8 rows and x1 by 12 rows.
// For avg the result is averaged once more with the bytes already at dst.
// v6 must hold the lowpass_const values.
function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1             {v16.8B}, [x1], x3
        ld1             {v17.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v25.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v27.8B}, [x1], x3
        ld1             {v28.8B}, [x1]

        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23

        // Average with the second source.
        ld1             {v24.8B}, [x12], x2
        ld1             {v25.8B}, [x12], x2
        ld1             {v26.8B}, [x12], x2
        ld1             {v27.8B}, [x12], x2
        ld1             {v28.8B}, [x12], x2
        urhadd          v16.8B, v24.8B, v16.8B
        urhadd          v17.8B, v25.8B, v17.8B
        ld1             {v29.8B}, [x12], x2
        urhadd          v18.8B, v26.8B, v18.8B
        urhadd          v19.8B, v27.8B, v19.8B
        ld1             {v30.8B}, [x12], x2
        urhadd          v20.8B, v28.8B, v20.8B
        urhadd          v21.8B, v29.8B, v21.8B
        ld1             {v31.8B}, [x12], x2
        urhadd          v22.8B, v30.8B, v22.8B
        urhadd          v23.8B, v31.8B, v23.8B

  .ifc \type,avg
        ld1             {v24.8B}, [x0], x3
        ld1             {v25.8B}, [x0], x3
        ld1             {v26.8B}, [x0], x3
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v27.8B}, [x0], x3
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v28.8B}, [x0], x3
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v29.8B}, [x0], x3
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v30.8B}, [x0], x3
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v31.8B}, [x0], x3
        urhadd          v21.8B, v21.8B, v29.8B
        urhadd          v22.8B, v22.8B, v30.8B
        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0, x0, x3, lsl #3               // back to the first dst row
  .endif

        st1             {v16.8B}, [x0], x3
        st1             {v17.8B}, [x0], x3
        st1             {v18.8B}, [x0], x3
        st1             {v19.8B}, [x0], x3
        st1             {v20.8B}, [x0], x3
        st1             {v21.8B}, [x0], x3
        st1             {v22.8B}, [x0], x3
        st1             {v23.8B}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

// Shared core of the 8x8 two-dimensional (horizontal then vertical) filter.
//   x1 = first of 13 source rows (16 bytes are loaded per row), x3 = stride
// Returns the 8 output rows as bytes in v16-v23; nothing is stored.
// On return x1 has advanced by 12 rows. Loads the filter constants itself,
// using w12 as scratch, so x12 is overwritten. v6 no longer holds the
// constants on return because lowpass_8.16 uses it as scratch.
// Clobbers v0-v7 and v16-v31.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        ld1             {v16.8H}, [x1], x3
        ld1             {v17.8H}, [x1], x3
        ld1             {v18.8H}, [x1], x3
        ld1             {v19.8H}, [x1], x3
        ld1             {v20.8H}, [x1], x3
        ld1             {v21.8H}, [x1], x3
        ld1             {v22.8H}, [x1], x3
        ld1             {v23.8H}, [x1], x3
        ld1             {v24.8H}, [x1], x3
        ld1             {v25.8H}, [x1], x3
        ld1             {v26.8H}, [x1], x3
        ld1             {v27.8H}, [x1], x3
        ld1             {v28.8H}, [x1]
        // Horizontal pass: every row becomes 8 16-bit sums, in place.
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 was not loaded above; it is filtered only because the macro
        // works on pairs, and its result is not read by the vertical pass.
        lowpass_8H      v28, v29

        // Vertical pass: each invocation overwrites its first argument,
        // which no later invocation reads.
        lowpass_8.16    v16, v17, v18, v19, v20, v21
        lowpass_8.16    v17, v18, v19, v20, v21, v22

        lowpass_8.16    v18, v19, v20, v21, v22, v23
        lowpass_8.16    v19, v20, v21, v22, v23, v24

        lowpass_8.16    v20, v21, v22, v23, v24, v25
        lowpass_8.16    v21, v22, v23, v24, v25, v26

        lowpass_8.16    v22, v23, v24, v25, v26, v27
        lowpass_8.16    v23, v24, v25, v26, v27, v28

        ret
endfunc

// Instantiates <type>_h264_qpel8_hv_lowpass_neon, type = put or avg.
.macro  h264_qpel8_hv_lowpass type
// 8x8 two-dimensional filter stored to dst.
//   x0 = dst, x2 = dst stride, x1 = source, x3 = source stride
// Return address is kept in x10 and used directly by the final ret.
// On return x0 has advanced by 8 rows and x1 by 12 rows.
// For avg the result is averaged with the bytes already at dst.
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        ld1             {v0.8B}, [x0], x2
        ld1             {v1.8B}, [x0], x2
        ld1             {v2.8B}, [x0], x2
        urhadd          v16.8B, v16.8B, v0.8B
        ld1             {v3.8B}, [x0], x2
        urhadd          v17.8B, v17.8B, v1.8B
        ld1             {v4.8B}, [x0], x2
        urhadd          v18.8B, v18.8B, v2.8B
        ld1             {v5.8B}, [x0], x2
        urhadd          v19.8B, v19.8B, v3.8B
        ld1             {v6.8B}, [x0], x2
        urhadd          v20.8B, v20.8B, v4.8B
        ld1             {v7.8B}, [x0], x2
        urhadd          v21.8B, v21.8B, v5.8B
        urhadd          v22.8B, v22.8B, v6.8B
        urhadd          v23.8B, v23.8B, v7.8B
        sub             x0, x0, x2, lsl #3               // back to the first dst row
  .endif

        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        st1             {v18.8B}, [x0], x2
        st1             {v19.8B}, [x0], x2
        st1             {v20.8B}, [x0], x2
        st1             {v21.8B}, [x0], x2
        st1             {v22.8B}, [x0], x2
        st1             {v23.8B}, [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

// Instantiates <type>_h264_qpel8_hv_lowpass_l2_neon, type = put or avg.
.macro  h264_qpel8_hv_lowpass_l2 type
// 8x8 two-dimensional filter averaged with a packed second source.
//   x0 = dst, x1 = source, x3 = stride of source and dst,
//   x2 = second source: 8 rows of 8 bytes stored back to back
// Return address is kept in x10. On return x0 has advanced by 8 rows, x1
// by 12 rows and x2 by 64 bytes.
// For avg the result is averaged once more with the bytes already at dst.
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

        ld1             {v0.8B, v1.8B}, [x2], #16
        ld1             {v2.8B, v3.8B}, [x2], #16
        urhadd          v0.8B, v0.8B, v16.8B
        urhadd          v1.8B, v1.8B, v17.8B
        ld1             {v4.8B, v5.8B}, [x2], #16
        urhadd          v2.8B, v2.8B, v18.8B
        urhadd          v3.8B, v3.8B, v19.8B
        ld1             {v6.8B, v7.8B}, [x2], #16
        urhadd          v4.8B, v4.8B, v20.8B
        urhadd          v5.8B, v5.8B, v21.8B
        urhadd          v6.8B, v6.8B, v22.8B
        urhadd          v7.8B, v7.8B, v23.8B
  .ifc \type,avg
        ld1             {v16.8B}, [x0], x3
        ld1             {v17.8B}, [x0], x3
        ld1             {v18.8B}, [x0], x3
        urhadd          v0.8B, v0.8B, v16.8B
        ld1             {v19.8B}, [x0], x3
        urhadd          v1.8B, v1.8B, v17.8B
        ld1             {v20.8B}, [x0], x3
        urhadd          v2.8B, v2.8B, v18.8B
        ld1             {v21.8B}, [x0], x3
        urhadd          v3.8B, v3.8B, v19.8B
        ld1             {v22.8B}, [x0], x3
        urhadd          v4.8B, v4.8B, v20.8B
        ld1             {v23.8B}, [x0], x3
        urhadd          v5.8B, v5.8B, v21.8B
        urhadd          v6.8B, v6.8B, v22.8B
        urhadd          v7.8B, v7.8B, v23.8B
        sub             x0, x0, x3, lsl #3               // back to the first dst row
  .endif
        st1             {v0.8B}, [x0], x3
        st1             {v1.8B}, [x0], x3
        st1             {v2.8B}, [x0], x3
        st1             {v3.8B}, [x0], x3
        st1             {v4.8B}, [x0], x3
        st1             {v5.8B}, [x0], x3
        st1             {v6.8B}, [x0], x3
        st1             {v7.8B}, [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

// Instantiates the 16x16 two-dimensional filters, type = put or avg.
.macro  h264_qpel16_hv  type
// 16x16 as four 8x8 calls in the order top-left, bottom-left, top-right,
// bottom-right; the last one is a tail call.
//   x0 = dst, x2 = dst stride, x1 = source, x3 = source stride
// Return address is kept in x13 (the 8x8 routine uses x10).
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1, x1, x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1, x1, x3, lsl #4               // 16 + 4 rows back: top row again
        sub             x1, x1, x3, lsl #2
        add             x1, x1, #8
        sub             x0, x0, x2, lsl #4
        add             x0, x0, #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1, x1, x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

// 16x16 two-dimensional filter averaged with a packed second source.
//   x0 = dst, x1 = source, x3 = stride of source and dst,
//   x4 = address 256 bytes past the start of the packed buffer
// The buffer is consumed 64 bytes per 8x8 call, in the same quadrant order
// as the calls below. x2 is overwritten. Return address is kept in x13.
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2, x4, #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1, x1, x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1, x1, x3, lsl #4
        sub             x1, x1, x3, lsl #2
        add             x1, x1, #8
        sub             x0, x0, x3, lsl #4
        add             x0, x0, #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1, x1, x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

// Exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon, type = put or avg.
// As used by the code below: x0 = dst, x1 = src, x2 = stride.
// NOTE(review): the C prototype is not visible in this file; presumably
// (uint8_t *dst, const uint8_t *src, ptrdiff_t stride) - confirm against
// the C side that registers these functions.
// Internal conventions shared by the entry points and the local labels
// <type>_h264_qpel8_mcXY they branch to:
//   x14 = return address of the exported function
//   x8  = saved dst, x9 = saved src used for the second filter pass
//   x11 = stack pointer before the scratch buffer was allocated
// In the summaries, h/v/hv denote the horizontal, vertical and combined
// filters and avg(a, b) a rounding average.
.macro  h264_qpel8      type
// dst = avg(h(src), src)
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x3, x1                           // second source: unfiltered src
        sub             x1, x1, #2                       // filter window starts 2 px left
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// dst = h(src)
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1, x1, #2
        mov             x3, x2                           // dst stride = src stride
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// dst = avg(h(src), src + 1)
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        add             x3, x1, #1
        sub             x1, x1, #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// dst = avg(v(src), src). The local label is also entered from mc03 with
// x12 already pointing one row further down.
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3, x2
        sub             x1, x1, x2, lsl #1               // filter window starts 2 rows up
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// dst = avg(h(x1), v(x9)): the horizontal result goes to a 64-byte stack
// buffer (8 rows, stride 8) which is then the second source of the
// vertical pass. The local label is also entered from mc31, mc13 and mc33
// with different x1/x9.
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp, sp, #64
        mov             x0, sp
        sub             x1, x1, #2
        mov             x3, #8
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0, x8
        mov             x3, x2
        mov             x12, sp                          // second source: h result
        sub             x1, x9, x2, lsl #1
        mov             x2, #8                           // stride of the stack buffer
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp, x11
        ret             x14
endfunc

// dst = avg(h(x1), hv(x9)): the horizontal result goes to the bottom 64
// bytes of a 256-byte stack allocation and is the packed second source of
// the hv pass. The local label is also entered from mc23.
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp, sp, #(8*8+16*12)
        sub             x1, x1, #2
        mov             x3, #8
        mov             x0, sp
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4, x0                           // x0 now points past the 64 bytes written
        mov             x0, x8
        sub             x1, x9, x2, lsl #1
        sub             x1, x1, #2
        mov             x3, x2
        sub             x2, x4, #64                      // start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp, x11
        ret             x14
endfunc

// Same as mc11 but the vertical pass starts one pixel to the right
// (x9 = src + 1) while the horizontal pass still starts at src.
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             x1, x1, #1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
        sub             x1, x1, #1
        b               \type\()_h264_qpel8_mc11
endfunc

// dst = v(src)
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1, x1, x2, lsl #1
        mov             x3, x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// dst = avg(v(x1), hv(x9)): the vertical result goes to the bottom 64 bytes
// of a 256-byte stack allocation and is the packed second source of the hv
// pass. The local label is also entered from mc32.
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp, sp, #(8*8+16*12)
        sub             x1, x1, x2, lsl #1
        mov             x3, x2                           // source stride for both passes
        mov             x2, #8                           // stride of the stack buffer
        mov             x0, sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4, x0
        mov             x0, x8
        sub             x1, x9, x3, lsl #1
        sub             x1, x1, #2
        sub             x2, x4, #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp, x11
        ret             x14
endfunc

// dst = hv(src). sp is saved and restored although nothing is allocated
// here. The constants are loaded by the hv core itself.
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        mov             x11, sp
        sub             x1, x1, x2, lsl #1
        sub             x1, x1, #2
        mov             x3, x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp, x11
        ret             x14
endfunc

// Same as mc12 but the vertical pass starts one pixel to the right.
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
        add             x1, x1, #1
        b               \type\()_h264_qpel8_mc12
endfunc

// dst = avg(v(src), src + stride)
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1, x2
        b               \type\()_h264_qpel8_mc01
endfunc

// Same as mc11 but the horizontal pass starts one row further down.
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
        add             x1, x1, x2
        b               \type\()_h264_qpel8_mc11
endfunc

// Same as mc21 but the horizontal pass starts one row further down.
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
        add             x1, x1, x2
        b               \type\()_h264_qpel8_mc21
endfunc

// Same as mc11 but the vertical pass starts one pixel to the right
// (x9 = src + 1) and the horizontal pass one row further down
// (x1 = src + stride).
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             x1, x1, #1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
        add             x1, x1, x2
        sub             x1, x1, #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

// Exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon, type = put or
// avg. Same register usage and internal conventions as the 8x8 set:
// x0 = dst, x1 = src, x2 = stride on entry; x14 = return address, x8/x9 =
// saved dst/src, x11 = saved stack pointer. The 16-wide workers set their
// own row count, so x12 is not loaded here for the horizontal cases.
.macro  h264_qpel16     type
// dst = avg(h(src), src)
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        mov             x3, x1
        sub             x1, x1, #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// dst = h(src)
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1, x1, #2
        mov             x3, x2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// dst = avg(h(src), src + 1)
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        add             x3, x1, #1
        sub             x1, x1, #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// dst = avg(v(src), src). The local label is also entered from mc03.
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel16_mc01:
        lowpass_const   w3
        mov             x3, x2
        sub             x1, x1, x2, lsl #1
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc

// dst = avg(h(x1), v(x9)) through a 256-byte stack buffer (16 rows,
// stride 16). The local label is also entered from mc31, mc13 and mc33.
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
\type\()_h264_qpel16_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp, sp, #256
        mov             x0, sp
        sub             x1, x1, #2
        mov             x3, #16
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x0, x8
        mov             x3, x2
        mov             x12, sp
        sub             x1, x9, x2, lsl #1
        mov             x2, #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp, x11
        ret             x14
endfunc

// dst = avg(h(x1), hv(x9)): the packed horizontal result fills the bottom
// 256 bytes of a 448-byte stack allocation; x4 is passed on pointing just
// past it. The local label is also entered from mc23.
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
\type\()_h264_qpel16_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp, sp, #(16*16+16*12)
        sub             x1, x1, #2
        mov             x0, sp
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4, x0
        mov             x0, x8
        sub             x1, x9, x2, lsl #1
        sub             x1, x1, #2
        mov             x3, x2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp, x11
        ret             x14
endfunc

// Same as mc11 but the vertical pass starts one pixel to the right.
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             x1, x1, #1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
        sub             x1, x1, #1
        b               \type\()_h264_qpel16_mc11
endfunc

// dst = v(src)
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1, x1, x2, lsl #1
        mov             x3, x2
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc

// dst = avg(v(x1), hv(x9)): the packed vertical result fills the bottom 256
// bytes of a 448-byte stack allocation; x4 is passed on pointing just past
// it. The stride is moved to x3 because the packed worker overwrites x2.
// The local label is also entered from mc32.
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
\type\()_h264_qpel16_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp, sp, #(16*16+16*12)
        sub             x1, x1, x2, lsl #1
        mov             x0, sp
        mov             x3, x2
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4, x0
        mov             x0, x8
        sub             x1, x9, x3, lsl #1
        sub             x1, x1, #2
        mov             x2, x3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp, x11
        ret             x14
endfunc

// dst = hv(src). sp is saved and restored although nothing is allocated
// here.
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x11, sp
        sub             x1, x1, x2, lsl #1
        sub             x1, x1, #2
        mov             x3, x2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp, x11                          // restore stack
        ret             x14
endfunc

// Same as mc12 but the vertical pass starts one pixel to the right.
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
        add             x1, x1, #1
        b               \type\()_h264_qpel16_mc12
endfunc

// dst = avg(v(src), src + stride)
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1, x2
        b               \type\()_h264_qpel16_mc01
endfunc

// Same as mc11 but the horizontal pass starts one row further down.
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
        add             x1, x1, x2
        b               \type\()_h264_qpel16_mc11
endfunc

// Same as mc21 but the horizontal pass starts one row further down.
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
        add             x1, x1, x2
        b               \type\()_h264_qpel16_mc21
endfunc

// Same as mc11 but the vertical pass starts one pixel to the right
// (x9 = src + 1) and the horizontal pass one row further down
// (x1 = src + stride).
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             x1, x1, #1
        mov             x14, x30
        mov             x8, x0
        mov             x9, x1
        add             x1, x1, x2
        sub             x1, x1, #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg