/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Conventions shared by every routine in this file, as visible from the code:
//   x0  address of the first (top-left) pixel of the block; rows are written
//       through it with a post-increment of x1
//   x1  distance in bytes between two consecutive rows
//   The row above the block is read at x0 - x1, the column to the left at
//   x0 - 1 (8-bit pixels) or x0 - 2 (16-bit pixels, "_10" routines).
//   Only x0-x5, v0-v7 and v16-v17 are written; nothing is spilled to the
//   stack and no other function is called.
// NOTE(review): presumably called from C as f(pixel *src, ptrdiff_t stride);
// confirm against the prototypes where these symbols are registered.

// ldcol.8: gather single bytes into consecutive byte lanes of \rd. Each load
// reads one byte at [\rs] and then advances \rs by \rt.
//   \rd  vector register name without arrangement suffix (e.g. v0)
//   \rs  address register, post-incremented by \rt after every load
//   \rt  register holding the post-increment
//   \n   8 (default) loads lanes 0-7, 16 loads lanes 0-15; with \n < 8
//        four lanes are loaded, selected by \hi
//   \hi  with \n < 8: 0 (default) fills lanes 0-3, 1 fills lanes 4-7
// Lanes that are not loaded keep whatever \rd held before.
.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n >= 8 || \hi == 0
        ld1             {\rd\().b}[0],  [\rs], \rt
        ld1             {\rd\().b}[1],  [\rs], \rt
        ld1             {\rd\().b}[2],  [\rs], \rt
        ld1             {\rd\().b}[3],  [\rs], \rt
.endif
.if \n >= 8 || \hi == 1
        ld1             {\rd\().b}[4],  [\rs], \rt
        ld1             {\rd\().b}[5],  [\rs], \rt
        ld1             {\rd\().b}[6],  [\rs], \rt
        ld1             {\rd\().b}[7],  [\rs], \rt
.endif
.if \n == 16
        ld1             {\rd\().b}[8],  [\rs], \rt
        ld1             {\rd\().b}[9],  [\rs], \rt
        ld1             {\rd\().b}[10], [\rs], \rt
        ld1             {\rd\().b}[11], [\rs], \rt
        ld1             {\rd\().b}[12], [\rs], \rt
        ld1             {\rd\().b}[13], [\rs], \rt
        ld1             {\rd\().b}[14], [\rs], \rt
        ld1             {\rd\().b}[15], [\rs], \rt
.endif
.endm

// Fill a 16x16 block of bytes with the constant 128.
// Tail-branches into the store loop of ff_pred16x16_dc_neon.
function ff_pred16x16_128_dc_neon, export=1
        movi            v0.16b,  #128
        b               .L_pred16x16_dc_end
endfunc

// Fill a 16x16 block with the rounded mean of the 16 bytes in the row above.
function ff_pred16x16_top_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.16b}, [x2]
        uaddlv          h0,  v0.16b             // h0 = sum of the 16 bytes
        rshrn           v0.8b,  v0.8h,  #4      // (sum + 8) >> 4
        dup             v0.16b, v0.b[0]
        b               .L_pred16x16_dc_end
endfunc

// Fill a 16x16 block with the rounded mean of the 16 bytes in the column to
// the left of the block.
function ff_pred16x16_left_dc_neon, export=1
        sub             x2,  x0,  #1            // x2 = pixel left of row 0
        ldcol.8         v0,  x2,  x1,  16
        uaddlv          h0,  v0.16b             // h0 = sum of the 16 bytes
        rshrn           v0.8b,  v0.8h,  #4      // (sum + 8) >> 4
        dup             v0.16b, v0.b[0]
        b               .L_pred16x16_dc_end
endfunc

// Fill a 16x16 block with the rounded mean of the 16 bytes above and the
// 16 bytes to the left. The store loop at .L_pred16x16_dc_end is shared with
// the other 8-bit 16x16 DC routines, which enter it with the fill value
// broadcast in v0.16b.
function ff_pred16x16_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x0,  #1            // x3 = pixel left of row 0
        ld1             {v0.16b}, [x2]
        ldcol.8         v1,  x3,  x1,  16
        uaddlv          h0,  v0.16b             // sum of the top row
        uaddlv          h1,  v1.16b             // sum of the left column
        add             v0.4h,  v0.4h,  v1.4h
        rshrn           v0.8b,  v0.8h,  #5      // (sum + 16) >> 5
        dup             v0.16b, v0.b[0]
.L_pred16x16_dc_end:
        mov             w3,  #8                 // 8 iterations x 2 rows
6:      st1             {v0.16b}, [x0], x1
        subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        b.ne            6b
        ret
endfunc

// For each of 16 rows, replicate the byte immediately left of the row across
// the 16 bytes of that row.
function ff_pred16x16_hor_neon, export=1
        sub             x2,  x0,  #1            // x2 = pixel left of row 0
        mov             w3,  #16
1:      ld1r            {v0.16b}, [x2], x1      // broadcast the left neighbour
        subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        b.ne            1b
        ret
endfunc

// Copy the 16 bytes of the row above the block into each of its 16 rows.
// x1 is doubled so that x0 walks the even rows and x2 the odd rows.
function ff_pred16x16_vert_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        add             x1,  x1,  x1            // step two rows at a time
        ld1             {v0.16b}, [x2], x1      // x2 now points at row 1
        mov             w3,  #8
1:      subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x2], x1
        b.ne            1b
        ret
endfunc

// Fill a 16x16 block of bytes with a gradient derived from the border pixels.
// Loads: v0 = 8 bytes of the row above starting one pixel left of the block,
// v2 = the last 8 bytes of the row above, v1 = left column from the row above
// the block down to row 6, v3 = left column rows 8-15. Differences of
// mirrored border pixels are weighted with p16weight (1..8). The loop emits
// 16 rows, each value shifted right by 5 with unsigned saturation to 8 bits.
// NOTE(review): the arithmetic appears to implement the H.264 16x16 plane
// mode; confirm the rounding against the C reference.
function ff_pred16x16_plane_neon, export=1
        sub             x3,  x0,  x1
        movrel          x4,  p16weight
        add             x2,  x3,  #8
        sub             x3,  x3,  #1
        ld1             {v0.8b},  [x3]
        ld1             {v2.8b},  [x2], x1
        ldcol.8         v1,  x3,  x1
        add             x3,  x3,  x1            // skip row 7 of the left column
        ldcol.8         v3,  x3,  x1
        rev64           v0.8b,  v0.8b
        rev64           v1.8b,  v1.8b
        uaddl           v7.8h,  v2.8b,  v3.8b
        usubl           v2.8h,  v2.8b,  v0.8b
        usubl           v3.8h,  v3.8b,  v1.8b
        ld1             {v0.8h},  [x4]          // v0 = weights 1..8
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h   // 5 * weighted sum
        rshrn           v4.4h,  v2.4s,  #6
        trn2            v5.4h,  v4.4h,  v4.4h
        add             v2.4h,  v4.4h,  v5.4h
        shl             v3.4h,  v2.4h,  #3
        ext             v7.16b, v7.16b, v7.16b, #14
        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
        add             v7.4h,  v7.4h,  v0.4h
        shl             v2.4h,  v7.4h,  #4
        sub             v2.4h,  v2.4h,  v3.4h
        shl             v3.4h,  v4.4h,  #4
        ext             v0.16b, v0.16b, v0.16b, #14
        sub             v6.4h,  v5.4h,  v3.4h
        mov             v0.h[0],  wzr           // v0 = {0, 1, ..., 7}
        mul             v0.8h,  v0.8h,  v4.h[0]
        dup             v1.8h,  v2.h[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.8h,  v6.h[0]
        shl             v2.8h,  v2.8h,  #3      // step from columns 0-7 to 8-15
        add             v1.8h,  v1.8h,  v0.8h
        add             v3.8h,  v3.8h,  v2.8h
        mov             w3,  #16
1:
        sqshrun         v0.8b,  v1.8h,  #5      // columns 0-7
        add             v1.8h,  v1.8h,  v2.8h
        sqshrun2        v0.16b, v1.8h,  #5      // columns 8-15
        add             v1.8h,  v1.8h,  v3.8h   // advance to the next row
        subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        b.ne            1b
        ret
endfunc

// Weights applied to the border differences by the plane routines.
const   p16weight, align=4
        .short          1,2,3,4,5,6,7,8
endconst
const   p8weight, align=4
        .short          1,2,3,4,1,2,3,4
endconst

// For each of 8 rows, replicate the byte immediately left of the row across
// the 8 bytes of that row.
function ff_pred8x8_hor_neon, export=1
        sub             x2,  x0,  #1            // x2 = pixel left of row 0
        mov             w3,  #8
1:      ld1r            {v0.8b},  [x2], x1
        subs            w3,  w3,  #1
        st1             {v0.8b},  [x0], x1
        b.ne            1b
        ret
endfunc

// Copy the 8 bytes of the row above the block into each of its 8 rows.
// x1 is doubled so that x0 walks the even rows and x2 the odd rows.
function ff_pred8x8_vert_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        lsl             x1,  x1,  #1            // step two rows at a time
        ld1             {v0.8b},  [x2], x1      // x2 now points at row 1
        mov             w3,  #4
1:      subs            w3,  w3,  #1
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x2], x1
        b.ne            1b
        ret
endfunc

// Fill an 8x8 block of bytes with a gradient derived from the border pixels.
// Loads: v0 lanes 0-3 = 4 bytes of the row above starting one pixel left of
// the block, v0 lanes 4-7 = left column from the row above the block down to
// row 2, v2 lanes 0-3 = the last 4 bytes of the row above, v3 lanes 0-3 =
// left column rows 4-7. Differences are weighted with p8weight. The loop
// emits 8 rows, each value shifted right by 5 with unsigned saturation.
// NOTE(review): the arithmetic appears to implement the H.264 chroma plane
// mode; confirm the rounding against the C reference.
function ff_pred8x8_plane_neon, export=1
        sub             x3,  x0,  x1
        movrel          x4,  p8weight
        movrel          x5,  p16weight
        add             x2,  x3,  #4
        sub             x3,  x3,  #1
        ld1             {v0.s}[0],  [x3]
        ld1             {v2.s}[0],  [x2], x1
        ldcol.8         v0,  x3,  x1,  4,  hi=1
        add             x3,  x3,  x1            // skip row 3 of the left column
        ldcol.8         v3,  x3,  x1,  4
        uaddl           v7.8h,  v2.8b,  v3.8b
        rev32           v0.8b,  v0.8b
        trn1            v2.2s,  v2.2s,  v3.2s
        usubl           v2.8h,  v2.8b,  v0.8b
        ld1             {v6.8h},  [x4]          // v6 = weights 1..4, 1..4
        mul             v2.8h,  v2.8h,  v6.8h
        ld1             {v0.8h},  [x5]          // v0 = 1..8
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s   // 17 * weighted sum
        rshrn           v5.4h,  v2.4s,  #5
        addp            v2.4h,  v5.4h,  v5.4h
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h   // 3 * (sum of both gradients)
        rev64           v7.4h,  v7.4h
        add             v7.4h,  v7.4h,  v0.4h
        shl             v2.4h,  v7.4h,  #4
        sub             v2.4h,  v2.4h,  v3.4h
        ext             v0.16b, v0.16b, v0.16b, #14
        mov             v0.h[0],  wzr           // v0 = {0, 1, ..., 7}
        mul             v0.8h,  v0.8h,  v5.h[0]
        dup             v1.8h,  v2.h[0]
        dup             v2.8h,  v5.h[1]         // per-row increment
        add             v1.8h,  v1.8h,  v0.8h
        mov             w3,  #8
1:
        sqshrun         v0.8b,  v1.8h,  #5
        subs            w3,  w3,  #1
        add             v1.8h,  v1.8h,  v2.8h   // advance to the next row
        st1             {v0.8b},  [x0], x1
        b.ne            1b
        ret
endfunc

// The 8-bit 8x8 DC routines below all finish in the store loop at
// .L_pred8x8_dc_end (inside ff_pred8x8_dc_neon). It expects v0.8b to hold
// the 8 bytes written to each of rows 0-3 and v1.8b the 8 bytes written to
// each of rows 4-7; within each, bytes 0-3 cover the left four columns and
// bytes 4-7 the right four.

// Fill an 8x8 block of bytes with the constant 128.
function ff_pred8x8_128_dc_neon, export=1
        movi            v0.8b,  #128
        movi            v1.8b,  #128
        b               .L_pred8x8_dc_end
endfunc

// Left four columns get the rounded mean of the first four bytes of the row
// above, right four columns the rounded mean of the last four; all 8 rows
// are identical.
function ff_pred8x8_top_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.8b},  [x2]
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h   // h0 = sum 0-3, h1 = sum 4-7
        zip1            v0.8h,  v0.8h,  v0.8h
        rshrn           v2.8b,  v0.8h,  #2      // (sum + 2) >> 2
        zip1            v0.8b,  v2.8b,  v2.8b   // 4 x left mean, 4 x right mean
        zip1            v1.8b,  v2.8b,  v2.8b
        b               .L_pred8x8_dc_end
endfunc

// Rows 0-3 get the rounded mean of left-column rows 0-3, rows 4-7 the
// rounded mean of left-column rows 4-7.
function ff_pred8x8_left_dc_neon, export=1
        sub             x2,  x0,  #1            // x2 = pixel left of row 0
        ldcol.8         v0,  x2,  x1
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h   // h0 = rows 0-3, h1 = rows 4-7
        rshrn           v2.8b,  v0.8h,  #2      // (sum + 2) >> 2
        dup             v1.8b,  v2.b[1]
        dup             v0.8b,  v2.b[0]
        b               .L_pred8x8_dc_end
endfunc

// Per-quadrant DC using the row above and the column to the left:
//   top-left     mean of top[0-3] and left[0-3]   (8 samples)
//   top-right    mean of top[4-7]                 (4 samples)
//   bottom-left  mean of left[4-7]                (4 samples)
//   bottom-right mean of top[4-7] and left[4-7]   (8 samples)
function ff_pred8x8_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x0,  #1            // x3 = pixel left of row 0
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1
        uaddlp          v0.4h,  v0.8b
        uaddlp          v1.4h,  v1.8b
        trn1            v2.2s,  v0.2s,  v1.2s
        trn2            v3.2s,  v0.2s,  v1.2s
        addp            v4.4h,  v2.4h,  v3.4h   // top0-3, left0-3, top4-7, left4-7
        addp            v5.4h,  v4.4h,  v4.4h   // h0 = top-left, h1 = bottom-right
        rshrn           v6.8b,  v5.8h,  #3      // 8-sample means
        rshrn           v7.8b,  v4.8h,  #2      // 4-sample means
        dup             v0.8b,  v6.b[0]
        dup             v2.8b,  v7.b[2]
        dup             v1.8b,  v7.b[3]
        dup             v3.8b,  v6.b[1]
        zip1            v0.2s,  v0.2s,  v2.2s   // rows 0-3
        zip1            v1.2s,  v1.2s,  v3.2s   // rows 4-7
.L_pred8x8_dc_end:
        mov             w3,  #4
        add             x2,  x0,  x1,  lsl #2   // x2 = row 4
6:      subs            w3,  w3,  #1
        st1             {v0.8b},  [x0], x1
        st1             {v1.8b},  [x2], x1
        b.ne            6b
        ret
endfunc

// Per-quadrant DC reading the row above and only rows 0-3 of the left column:
//   top-left     mean of top[0-3] and left[0-3]
//   top-right    mean of top[4-7]
//   bottom-left  mean of top[0-3]
//   bottom-right mean of top[4-7]
function ff_pred8x8_l0t_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x0,  #1            // x3 = pixel left of row 0
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1,  4
        zip1            v0.4s,  v0.4s,  v1.4s   // top0-3, left0-3, top4-7, (unused)
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.4h,  v0.4h,  v0.4h   // h0 = top0-3 + left0-3
        rshrn           v2.8b,  v0.8h,  #2
        rshrn           v3.8b,  v1.8h,  #3
        dup             v4.8b,  v3.b[0]
        dup             v6.8b,  v2.b[2]
        dup             v5.8b,  v2.b[0]
        zip1            v0.2s,  v4.2s,  v6.2s   // rows 0-3
        zip1            v1.2s,  v5.2s,  v6.2s   // rows 4-7
        b               .L_pred8x8_dc_end
endfunc

// Rows 0-3 get the rounded mean of left-column rows 0-3; rows 4-7 are filled
// with the constant 128.
function ff_pred8x8_l00_dc_neon, export=1
        sub             x2,  x0,  #1            // x2 = pixel left of row 0
        ldcol.8         v0,  x2,  x1,  4
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h   // h0 = sum of the 4 bytes
        rshrn           v0.8b,  v0.8h,  #2
        movi            v1.8b,  #128
        dup             v0.8b,  v0.b[0]
        b               .L_pred8x8_dc_end
endfunc

// Per-quadrant DC reading the row above and only rows 4-7 of the left column:
//   top-left     mean of top[0-3]
//   top-right    mean of top[4-7]
//   bottom-left  mean of left[4-7]
//   bottom-right mean of top[4-7] and left[4-7]
function ff_pred8x8_0lt_dc_neon, export=1
        add             x3,  x0,  x1,  lsl #2   // x3 = row 4
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x3,  #1            // x3 = pixel left of row 4
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1,  4,  hi=1
        zip1            v0.4s,  v0.4s,  v1.4s   // top0-3, (unused), top4-7, left4-7
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.4h,  v0.4h,  v0.4h   // h1 = top4-7 + left4-7
        rshrn           v2.8b,  v0.8h,  #2
        rshrn           v3.8b,  v1.8h,  #3
        dup             v4.8b,  v2.b[0]
        dup             v5.8b,  v2.b[3]
        dup             v6.8b,  v2.b[2]
        dup             v7.8b,  v3.b[1]
        zip1            v0.2s,  v4.2s,  v6.2s   // rows 0-3
        zip1            v1.2s,  v5.2s,  v7.2s   // rows 4-7
        b               .L_pred8x8_dc_end
endfunc

// Rows 0-3 are filled with the constant 128; rows 4-7 get the rounded mean
// of left-column rows 4-7.
function ff_pred8x8_0l0_dc_neon, export=1
        add             x2,  x0,  x1,  lsl #2   // x2 = row 4
        sub             x2,  x2,  #1            // x2 = pixel left of row 4
        ldcol.8         v1,  x2,  x1,  4
        uaddlp          v2.4h,  v1.8b
        addp            v2.4h,  v2.4h,  v2.4h   // h0 = sum of the 4 bytes
        rshrn           v1.8b,  v2.8h,  #2
        movi            v0.8b,  #128
        dup             v1.8b,  v1.b[0]
        b               .L_pred8x8_dc_end
endfunc

// ldcol.16: gather single halfwords into consecutive halfword lanes of \rd.
// Each load reads 16 bits at [\rs] and then advances \rs by \rt.
//   \rd  vector register name without arrangement suffix (e.g. v0)
//   \rs  address register, post-incremented by \rt after every load
//   \rt  register holding the post-increment
//   \n   4 (default) loads lanes 0-3, 8 loads lanes 0-7
//   \hi  1 loads lanes 4-7 only (lanes 0-3 are skipped)
// Lanes that are not loaded keep whatever \rd held before.
.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
.if \n >= 4 && \hi == 0
        ld1             {\rd\().h}[0],  [\rs], \rt
        ld1             {\rd\().h}[1],  [\rs], \rt
        ld1             {\rd\().h}[2],  [\rs], \rt
        ld1             {\rd\().h}[3],  [\rs], \rt
.endif
.if \n == 8 || \hi == 1
        ld1             {\rd\().h}[4],  [\rs], \rt
        ld1             {\rd\().h}[5],  [\rs], \rt
        ld1             {\rd\().h}[6],  [\rs], \rt
        ld1             {\rd\().h}[7],  [\rs], \rt
.endif
.endm

// slower than C
/*
function ff_pred16x16_128_dc_neon_10, export=1
        movi            v0.8h, #2, lsl #8       // 512, 1 << (bit_depth - 1)

        b               .L_pred16x16_dc_10_end
endfunc
*/

// 16-bit pixels: fill a 16x16 block with the rounded mean of the 16
// halfwords in the row above.
function ff_pred16x16_top_dc_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block

        ld1             {v0.8h, v1.8h}, [x2]

        add             v0.8h,  v0.8h,  v1.8h
        addv            h0,  v0.8h              // h0 = sum of the 16 halfwords

        urshr           v0.4h,  v0.4h,  #4      // (sum + 8) >> 4
        dup             v0.8h,  v0.h[0]
        b               .L_pred16x16_dc_10_end
endfunc

// slower than C
/*
function ff_pred16x16_left_dc_neon_10, export=1
        sub             x2,  x0,  #2            // access to the "left" column
        ldcol.16        v0,  x2,  x1,  8
        ldcol.16        v1,  x2,  x1,  8        // load "left" column

        add             v0.8h,  v0.8h,  v1.8h
        addv            h0,  v0.8h

        urshr           v0.4h,  v0.4h,  #4
        dup             v0.8h,  v0.h[0]
        b               .L_pred16x16_dc_10_end
endfunc
*/

// 16-bit pixels: fill a 16x16 block with the rounded mean of the 16
// halfwords above and the 16 halfwords to the left. The store loop at
// .L_pred16x16_dc_10_end is shared with ff_pred16x16_top_dc_neon_10 and
// expects the fill value broadcast in v0.8h.
function ff_pred16x16_dc_neon_10, export=1
        sub             x2,  x0,  x1            // access to the "top" row
        sub             x3,  x0,  #2            // access to the "left" column

        ld1             {v0.8h, v1.8h}, [x2]
        ldcol.16        v2,  x3,  x1,  8
        ldcol.16        v3,  x3,  x1,  8        // load pixels in "top" row and "left" col

        add             v0.8h,  v0.8h,  v1.8h
        add             v2.8h,  v2.8h,  v3.8h
        add             v0.8h,  v0.8h,  v2.8h
        addv            h0,  v0.8h              // h0 = sum of all 32 halfwords

        urshr           v0.4h,  v0.4h,  #5      // (sum + 16) >> 5
        dup             v0.8h,  v0.h[0]
.L_pred16x16_dc_10_end:
        mov             v1.16b, v0.16b          // second half of each row
        mov             w3,  #8                 // 8 iterations x 2 rows
6:      st1             {v0.8h, v1.8h}, [x0], x1
        subs            w3,  w3,  #1
        st1             {v0.8h, v1.8h}, [x0], x1
        b.ne            6b
        ret
endfunc

// 16-bit pixels: for each of 16 rows, replicate the halfword immediately
// left of the row across the 16 halfwords of that row.
function ff_pred16x16_hor_neon_10, export=1
        sub             x2,  x0,  #2            // x2 = pixel left of row 0
        add             x3,  x0,  #16           // x3 = second half of row 0

        mov             w4,  #16
1:      ld1r            {v0.8h},  [x2], x1
        subs            w4,  w4,  #1
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x3], x1
        b.ne            1b
        ret
endfunc

// 16-bit pixels: copy the 16 halfwords of the row above the block into each
// of its 16 rows. x1 is doubled so that x0 walks the even rows and x2 the
// odd rows.
function ff_pred16x16_vert_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        add             x1,  x1,  x1            // step two rows at a time

        ld1             {v0.8h, v1.8h}, [x2], x1        // x2 now points at row 1

        mov             w3,  #8
1:      subs            w3,  w3,  #1
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x2], x1

        b.ne            1b
        ret
endfunc

// 16-bit pixels: fill a 16x16 block with a gradient derived from the border
// pixels. Same load pattern as ff_pred16x16_plane_neon with halfword
// elements; the row accumulators are kept as 32-bit lanes in v16/v17. Each
// output is shifted right by 5 with unsigned saturation and then limited to
// 1023 by the smin against v4.
// NOTE(review): the arithmetic appears to implement the H.264 16x16 plane
// mode at 10-bit depth; confirm against the C reference.
function ff_pred16x16_plane_neon_10, export=1
        sub             x3,  x0,  x1
        movrel          x4,  p16weight
        add             x2,  x3,  #16
        sub             x3,  x3,  #2
        ld1             {v0.8h},  [x3]
        ld1             {v2.8h},  [x2], x1
        ldcol.16        v1,  x3,  x1,  8
        add             x3,  x3,  x1            // skip row 7 of the left column
        ldcol.16        v3,  x3,  x1,  8

        rev64           v16.8h, v0.8h
        rev64           v17.8h, v1.8h
        ext             v0.16b, v16.16b, v16.16b, #8    // full 8-lane reversal
        ext             v1.16b, v17.16b, v17.16b, #8

        add             v7.8h,  v2.8h,  v3.8h
        sub             v2.8h,  v2.8h,  v0.8h
        sub             v3.8h,  v3.8h,  v1.8h
        ld1             {v0.8h},  [x4]          // v0 = weights 1..8
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h   // 5 * weighted sum
        rshrn           v4.4h,  v2.4s,  #6
        trn2            v5.4h,  v4.4h,  v4.4h
        add             v2.4h,  v4.4h,  v5.4h
        shl             v3.4h,  v2.4h,  #3
        ext             v7.16b, v7.16b, v7.16b, #14
        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
        add             v7.4h,  v7.4h,  v0.4h
        shl             v2.4h,  v7.4h,  #4
        ssubl           v2.4s,  v2.4h,  v3.4h
        shl             v3.4h,  v4.4h,  #4
        ext             v0.16b, v0.16b, v0.16b, #14
        ssubl           v6.4s,  v5.4h,  v3.4h

        mov             v0.h[0],  wzr           // v0 = {0, 1, ..., 7}
        mul             v0.8h,  v0.8h,  v4.h[0]
        dup             v16.4s, v2.s[0]
        dup             v17.4s, v2.s[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.4s,  v6.s[0]
        shl             v2.8h,  v2.8h,  #3      // step from columns 0-7 to 8-15
        saddw           v16.4s, v16.4s, v0.4h
        saddw2          v17.4s, v17.4s, v0.8h
        saddw           v3.4s,  v3.4s,  v2.4h

        mov             w3,  #16
        mvni            v4.8h,  #0xFC, lsl #8   // 1023 for clipping
1:
        sqshrun         v0.4h,  v16.4s, #5      // columns 0-7
        sqshrun2        v0.8h,  v17.4s, #5
        saddw           v16.4s, v16.4s, v2.4h
        saddw           v17.4s, v17.4s, v2.4h
        sqshrun         v1.4h,  v16.4s, #5      // columns 8-15
        sqshrun2        v1.8h,  v17.4s, #5
        add             v16.4s, v16.4s, v3.4s   // advance to the next row
        add             v17.4s, v17.4s, v3.4s

        subs            w3,  w3,  #1

        smin            v0.8h,  v0.8h,  v4.8h
        smin            v1.8h,  v1.8h,  v4.8h

        st1             {v0.8h, v1.8h}, [x0], x1
        b.ne            1b
        ret
endfunc

// 16-bit pixels: for each of 8 rows, replicate the halfword immediately left
// of the row across the 8 halfwords of that row.
function ff_pred8x8_hor_neon_10, export=1
        sub             x2,  x0,  #2            // x2 = pixel left of row 0
        mov             w3,  #8

1:      ld1r            {v0.8h},  [x2], x1
        subs            w3,  w3,  #1
        st1             {v0.8h},  [x0], x1
        b.ne            1b
        ret
endfunc

// 16-bit pixels: copy the 8 halfwords of the row above the block into each
// of its 8 rows. x1 is doubled so that x0 walks the even rows and x2 the odd
// rows.
function ff_pred8x8_vert_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        lsl             x1,  x1,  #1            // step two rows at a time

        ld1             {v0.8h},  [x2], x1      // x2 now points at row 1
        mov             w3,  #4
1:      subs            w3,  w3,  #1
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x2], x1
        b.ne            1b
        ret
endfunc

// 16-bit pixels: fill an 8x8 block with a gradient derived from the border
// pixels. Same load pattern as ff_pred8x8_plane_neon with halfword elements;
// the row accumulators are kept as 32-bit lanes in v1/v2. Each output is
// shifted right by 5 with unsigned saturation and then limited to 1023.
// NOTE(review): the arithmetic appears to implement the H.264 chroma plane
// mode at 10-bit depth; confirm against the C reference.
function ff_pred8x8_plane_neon_10, export=1
        sub             x3,  x0,  x1
        movrel          x4,  p8weight
        movrel          x5,  p16weight
        add             x2,  x3,  #8
        sub             x3,  x3,  #2
        ld1             {v0.d}[0],  [x3]
        ld1             {v2.d}[0],  [x2], x1
        ldcol.16        v0,  x3,  x1,  hi=1
        add             x3,  x3,  x1            // skip row 3 of the left column
        ldcol.16        v3,  x3,  x1,  4
        add             v7.8h,  v2.8h,  v3.8h
        rev64           v0.8h,  v0.8h
        trn1            v2.2d,  v2.2d,  v3.2d
        sub             v2.8h,  v2.8h,  v0.8h
        ld1             {v6.8h},  [x4]          // v6 = weights 1..4, 1..4
        mul             v2.8h,  v2.8h,  v6.8h
        ld1             {v0.8h},  [x5]          // v0 = 1..8
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s   // 17 * weighted sum
        rshrn           v5.4h,  v2.4s,  #5
        addp            v2.4h,  v5.4h,  v5.4h
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h   // 3 * (sum of both gradients)
        rev64           v7.4h,  v7.4h
        add             v7.4h,  v7.4h,  v0.4h
        shl             v2.4h,  v7.4h,  #4
        ssubl           v2.4s,  v2.4h,  v3.4h
        ext             v0.16b, v0.16b, v0.16b, #14
        mov             v0.h[0],  wzr           // v0 = {0, 1, ..., 7}
        mul             v0.8h,  v0.8h,  v5.h[0]
        dup             v1.4s,  v2.s[0]
        dup             v2.4s,  v2.s[0]
        dup             v3.8h,  v5.h[1]         // per-row increment
        saddw           v1.4s,  v1.4s,  v0.4h
        saddw2          v2.4s,  v2.4s,  v0.8h
        mov             w3,  #8
        mvni            v4.8h,  #0xFC, lsl #8   // 1023 for clipping
1:
        sqshrun         v0.4h,  v1.4s,  #5
        sqshrun2        v0.8h,  v2.4s,  #5

        saddw           v1.4s,  v1.4s,  v3.4h   // advance to the next row
        saddw           v2.4s,  v2.4s,  v3.4h

        subs            w3,  w3,  #1

        smin            v0.8h,  v0.8h,  v4.8h

        st1             {v0.8h},  [x0], x1
        b.ne            1b
        ret
endfunc

// The 16-bit 8x8 DC routines below all finish in the store loop at
// .L_pred8x8_dc_10_end (inside ff_pred8x8_dc_neon_10). It expects v0.8h to
// hold the 8 halfwords written to each of rows 0-3 and v1.8h the 8 halfwords
// written to each of rows 4-7; within each, lanes 0-3 cover the left four
// columns and lanes 4-7 the right four.

// 16-bit pixels: fill an 8x8 block with the constant 512.
function ff_pred8x8_128_dc_neon_10, export=1
        movi            v0.8h,  #2, lsl #8      // 512, 1 << (bit_depth - 1)
        movi            v1.8h,  #2, lsl #8
        b               .L_pred8x8_dc_10_end
endfunc

// 16-bit pixels: left four columns get the rounded mean of the first four
// halfwords of the row above, right four columns the rounded mean of the
// last four; all 8 rows are identical.
function ff_pred8x8_top_dc_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.8h},  [x2]

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v0.4h,  v0.4h,  v0.4h   // h0 = sum 0-3, h1 = sum 4-7
        zip1            v0.4h,  v0.4h,  v0.4h
        urshr           v2.4h,  v0.4h,  #2      // (sum + 2) >> 2
        zip1            v0.8h,  v2.8h,  v2.8h   // 4 x left mean, 4 x right mean
        zip1            v1.8h,  v2.8h,  v2.8h
        b               .L_pred8x8_dc_10_end
endfunc

// 16-bit pixels: rows 0-3 get the rounded mean of left-column rows 0-3,
// rows 4-7 the rounded mean of left-column rows 4-7.
function ff_pred8x8_left_dc_neon_10, export=1
        sub             x2,  x0,  #2            // x2 = pixel left of row 0
        ldcol.16        v0,  x2,  x1,  8

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v0.4h,  v0.4h,  v0.4h   // h0 = rows 0-3, h1 = rows 4-7
        urshr           v2.4h,  v0.4h,  #2      // (sum + 2) >> 2
        dup             v1.8h,  v2.h[1]
        dup             v0.8h,  v2.h[0]
        b               .L_pred8x8_dc_10_end
endfunc

// 16-bit pixels: per-quadrant DC using the row above and the column to the
// left:
//   top-left     mean of top[0-3] and left[0-3]   (8 samples)
//   top-right    mean of top[4-7]                 (4 samples)
//   bottom-left  mean of left[4-7]                (4 samples)
//   bottom-right mean of top[4-7] and left[4-7]   (8 samples)
function ff_pred8x8_dc_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x0,  #2            // x3 = pixel left of row 0

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  8

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.8h,  v1.8h,  v1.8h
        trn1            v2.2s,  v0.2s,  v1.2s
        trn2            v3.2s,  v0.2s,  v1.2s
        addp            v4.4h,  v2.4h,  v3.4h   // top0-3, left0-3, top4-7, left4-7
        addp            v5.4h,  v4.4h,  v4.4h   // h0 = top-left, h1 = bottom-right
        urshr           v6.4h,  v5.4h,  #3      // 8-sample means
        urshr           v7.4h,  v4.4h,  #2      // 4-sample means
        dup             v0.8h,  v6.h[0]
        dup             v2.8h,  v7.h[2]
        dup             v1.8h,  v7.h[3]
        dup             v3.8h,  v6.h[1]
        zip1            v0.2d,  v0.2d,  v2.2d   // rows 0-3
        zip1            v1.2d,  v1.2d,  v3.2d   // rows 4-7
.L_pred8x8_dc_10_end:
        mov             w3,  #4
        add             x2,  x0,  x1,  lsl #2   // x2 = row 4

6:      st1             {v0.8h},  [x0], x1
        subs            w3,  w3,  #1
        st1             {v1.8h},  [x2], x1
        b.ne            6b
        ret
endfunc

// 16-bit pixels: per-quadrant DC reading the row above and only rows 0-3 of
// the left column:
//   top-left     mean of top[0-3] and left[0-3]
//   top-right    mean of top[4-7]
//   bottom-left  mean of top[0-3]
//   bottom-right mean of top[4-7]
function ff_pred8x8_l0t_dc_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x0,  #2            // x3 = pixel left of row 0

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  4

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.4h,  v1.4h,  v1.4h
        addp            v0.4h,  v0.4h,  v0.4h   // h0 = top0-3, h1 = top4-7
        addp            v1.4h,  v1.4h,  v1.4h   // h0 = left0-3
        add             v1.4h,  v1.4h,  v0.4h   // h0 = left0-3 + top0-3

        urshr           v2.4h,  v0.4h,  #2
        urshr           v3.4h,  v1.4h,  #3      // the pred4x4 part

        dup             v4.4h,  v3.h[0]
        dup             v5.4h,  v2.h[0]
        dup             v6.4h,  v2.h[1]

        zip1            v0.2d,  v4.2d,  v6.2d   // rows 0-3
        zip1            v1.2d,  v5.2d,  v6.2d   // rows 4-7
        b               .L_pred8x8_dc_10_end
endfunc

// 16-bit pixels: rows 0-3 get the rounded mean of left-column rows 0-3;
// rows 4-7 are filled with the constant 512.
function ff_pred8x8_l00_dc_neon_10, export=1
        sub             x2,  x0,  #2            // x2 = pixel left of row 0

        ldcol.16        v0,  x2,  x1,  4

        addp            v0.4h,  v0.4h,  v0.4h
        addp            v0.4h,  v0.4h,  v0.4h   // h0 = sum of the 4 halfwords
        urshr           v0.4h,  v0.4h,  #2

        movi            v1.8h,  #2, lsl #8      // 512
        dup             v0.8h,  v0.h[0]
        b               .L_pred8x8_dc_10_end
endfunc

// 16-bit pixels: per-quadrant DC reading the row above and only rows 4-7 of
// the left column:
//   top-left     mean of top[0-3]
//   top-right    mean of top[4-7]
//   bottom-left  mean of left[4-7]
//   bottom-right mean of top[4-7] and left[4-7]
function ff_pred8x8_0lt_dc_neon_10, export=1
        add             x3,  x0,  x1,  lsl #2   // x3 = row 4
        sub             x2,  x0,  x1            // x2 = row above the block
        sub             x3,  x3,  #2            // x3 = pixel left of row 4

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  hi=1

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.8h,  v1.8h,  v1.8h
        addp            v0.4h,  v0.4h,  v0.4h   // h0 = top0-3, h1 = top4-7
        addp            v1.4h,  v1.4h,  v1.4h   // h1 = left4-7
        zip1            v0.2s,  v0.2s,  v1.2s   // top0-3, top4-7, (unused), left4-7
        add             v1.4h,  v0.4h,  v1.4h   // h1 = top4-7 + left4-7

        urshr           v2.4h,  v0.4h,  #2
        urshr           v3.4h,  v1.4h,  #3

        dup             v4.4h,  v2.h[0]
        dup             v5.4h,  v2.h[3]
        dup             v6.4h,  v2.h[1]
        dup             v7.4h,  v3.h[1]

        zip1            v0.2d,  v4.2d,  v6.2d   // rows 0-3
        zip1            v1.2d,  v5.2d,  v7.2d   // rows 4-7
        b               .L_pred8x8_dc_10_end
endfunc

// 16-bit pixels: rows 0-3 are filled with the constant 512; rows 4-7 get the
// rounded mean of left-column rows 4-7.
function ff_pred8x8_0l0_dc_neon_10, export=1
        add             x2,  x0,  x1,  lsl #2   // x2 = row 4
        sub             x2,  x2,  #2            // x2 = pixel left of row 4

        ldcol.16        v1,  x2,  x1,  4

        addp            v2.8h,  v1.8h,  v1.8h
        addp            v2.4h,  v2.4h,  v2.4h   // h0 = sum of the 4 halfwords
        urshr           v1.4h,  v2.4h,  #2

        movi            v0.8h,  #2, lsl #8      // 512
        dup             v1.8h,  v1.h[0]
        b               .L_pred8x8_dc_10_end
endfunc