1;****************************************************************************** 2;* VP9 Intra prediction SIMD optimizations 3;* 4;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com> 5;* Copyright (c) 2015 Henrik Gramner <henrik gramner com> 6;* 7;* This file is part of FFmpeg. 8;* 9;* FFmpeg is free software; you can redistribute it and/or 10;* modify it under the terms of the GNU Lesser General Public 11;* License as published by the Free Software Foundation; either 12;* version 2.1 of the License, or (at your option) any later version. 13;* 14;* FFmpeg is distributed in the hope that it will be useful, 15;* but WITHOUT ANY WARRANTY; without even the implied warranty of 16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17;* Lesser General Public License for more details. 18;* 19;* You should have received a copy of the GNU Lesser General Public 20;* License along with FFmpeg; if not, write to the Free Software 21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22;****************************************************************************** 23 24%include "libavutil/x86/x86util.asm" 25 26SECTION_RODATA 32 27 28pd_2: times 8 dd 2 29pd_4: times 8 dd 4 30pd_8: times 8 dd 8 31 32pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15 33pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0 34pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7 35 36cextern pw_1 37cextern pw_1023 38cextern pw_4095 39cextern pd_16 40cextern pd_32 41cextern pd_65535; 42 43; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take 44; only 3 registers on x86-32, which would make it one cycle faster, but that 45; would make the code quite a bit uglier... 46 47SECTION .text 48 49%macro SCRATCH 3-4 50%if ARCH_X86_64 51 SWAP %1, %2 52%if %0 == 4 53%define reg_%4 m%2 54%endif 55%else 56 mova [%3], m%1 57%if %0 == 4 58%define reg_%4 [%3] 59%endif 60%endif 61%endmacro 62 63%macro UNSCRATCH 3-4 64%if ARCH_X86_64 65 SWAP %1, %2 66%else 67 mova m%1, [%3] 68%endif 69%if %0 == 4 70%undef reg_%4 71%endif 72%endmacro 73 74%macro PRELOAD 2-3 75%if ARCH_X86_64 76 mova m%1, [%2] 77%if %0 == 3 78%define reg_%3 m%1 79%endif 80%elif %0 == 3 81%define reg_%3 [%2] 82%endif 83%endmacro 84 85INIT_MMX mmx 86cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a 87 movifnidn aq, amp 88 mova m0, [aq] 89 DEFINE_ARGS dst, stride, stride3 90 lea stride3q, [strideq*3] 91 mova [dstq+strideq*0], m0 92 mova [dstq+strideq*1], m0 93 mova [dstq+strideq*2], m0 94 mova [dstq+stride3q ], m0 95 RET 96 97INIT_XMM sse 98cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a 99 movifnidn aq, amp 100 mova m0, [aq] 101 DEFINE_ARGS dst, stride, stride3 102 lea stride3q, [strideq*3] 103 mova [dstq+strideq*0], m0 104 mova [dstq+strideq*1], m0 105 mova [dstq+strideq*2], m0 106 mova [dstq+stride3q ], m0 107 lea dstq, [dstq+strideq*4] 108 mova [dstq+strideq*0], m0 109 mova [dstq+strideq*1], m0 110 mova [dstq+strideq*2], m0 111 mova [dstq+stride3q ], m0 112 RET 113 114INIT_XMM sse 115cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a 116 movifnidn aq, amp 117 mova m0, [aq] 118 mova m1, [aq+mmsize] 119 DEFINE_ARGS dst, stride, stride3, cnt 120 lea stride3q, [strideq*3] 121 mov cntd, 4 122.loop: 123 mova [dstq+strideq*0+ 0], m0 124 mova [dstq+strideq*0+16], m1 125 mova [dstq+strideq*1+ 0], m0 126 mova [dstq+strideq*1+16], m1 127 mova [dstq+strideq*2+ 0], m0 128 mova [dstq+strideq*2+16], m1 129 mova [dstq+stride3q + 0], m0 130 mova [dstq+stride3q +16], m1 131 lea dstq, [dstq+strideq*4] 132 dec cntd 133 jg .loop 134 RET 135 136INIT_XMM sse 137cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a 138 movifnidn aq, amp 139 mova m0, [aq+mmsize*0] 140 mova m1, [aq+mmsize*1] 141 mova m2, [aq+mmsize*2] 142 mova m3, [aq+mmsize*3] 143 DEFINE_ARGS dst, stride, cnt 144 mov cntd, 16 145.loop: 146 mova [dstq+strideq*0+ 0], m0 147 mova [dstq+strideq*0+16], m1 148 mova [dstq+strideq*0+32], m2 149 mova [dstq+strideq*0+48], m3 150 mova [dstq+strideq*1+ 0], m0 151 mova [dstq+strideq*1+16], m1 152 mova [dstq+strideq*1+32], m2 153 mova [dstq+strideq*1+48], m3 154 lea dstq, [dstq+strideq*2] 155 dec cntd 156 jg .loop 157 RET 158 159INIT_MMX mmxext 160cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a 161 mova m3, [lq] 162 DEFINE_ARGS dst, stride, stride3 163 lea stride3q, [strideq*3] 164 pshufw m0, m3, q3333 165 pshufw m1, m3, q2222 166 pshufw m2, m3, q1111 167 pshufw m3, m3, q0000 168 mova [dstq+strideq*0], m0 169 mova [dstq+strideq*1], m1 170 mova [dstq+strideq*2], m2 171 mova [dstq+stride3q ], m3 172 RET 173 174INIT_XMM sse2 175cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a 176 mova m2, [lq] 177 DEFINE_ARGS dst, stride, stride3 178 lea stride3q, [strideq*3] 179 punpckhwd m3, m2, m2 180 pshufd m0, m3, q3333 181 pshufd m1, m3, q2222 182 mova [dstq+strideq*0], m0 183 mova [dstq+strideq*1], m1 184 pshufd m0, m3, q1111 185 pshufd m1, m3, q0000 186 mova [dstq+strideq*2], m0 187 mova [dstq+stride3q ], m1 188 lea dstq, [dstq+strideq*4] 189 punpcklwd m2, m2 190 pshufd m0, m2, q3333 191 pshufd m1, m2, q2222 192 mova [dstq+strideq*0], m0 193 mova [dstq+strideq*1], m1 194 pshufd m0, m2, q1111 195 pshufd m1, m2, q0000 196 mova [dstq+strideq*2], m0 197 mova [dstq+stride3q ], m1 198 RET 199 200INIT_XMM sse2 201cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt 202 mov cntd, 3 203 lea stride3q, [strideq*3] 204.loop: 205 movh m3, [lq+cntq*8] 206 punpcklwd m3, m3 207 pshufd m0, m3, q3333 208 pshufd m1, m3, q2222 209 pshufd m2, m3, q1111 210 pshufd m3, m3, q0000 211 mova [dstq+strideq*0+ 0], m0 212 mova [dstq+strideq*0+16], m0 213 mova [dstq+strideq*1+ 0], m1 214 mova [dstq+strideq*1+16], m1 215 mova [dstq+strideq*2+ 0], m2 216 mova [dstq+strideq*2+16], m2 217 mova [dstq+stride3q + 0], m3 218 mova [dstq+stride3q +16], m3 219 lea dstq, [dstq+strideq*4] 220 dec cntd 221 jge .loop 222 RET 223 224INIT_XMM sse2 225cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt 226 mov cntd, 7 227 lea stride3q, [strideq*3] 228.loop: 229 movh m3, [lq+cntq*8] 230 punpcklwd m3, m3 231 pshufd m0, m3, q3333 232 pshufd m1, m3, q2222 233 pshufd m2, m3, q1111 234 pshufd m3, m3, q0000 235 mova [dstq+strideq*0+ 0], m0 236 mova [dstq+strideq*0+16], m0 237 mova [dstq+strideq*0+32], m0 238 mova [dstq+strideq*0+48], m0 239 mova [dstq+strideq*1+ 0], m1 240 mova [dstq+strideq*1+16], m1 241 mova [dstq+strideq*1+32], m1 242 mova [dstq+strideq*1+48], m1 243 mova [dstq+strideq*2+ 0], m2 244 mova [dstq+strideq*2+16], m2 245 mova [dstq+strideq*2+32], m2 246 mova [dstq+strideq*2+48], m2 247 mova [dstq+stride3q + 0], m3 248 mova [dstq+stride3q +16], m3 249 mova [dstq+stride3q +32], m3 250 mova [dstq+stride3q +48], m3 251 lea dstq, [dstq+strideq*4] 252 dec cntd 253 jge .loop 254 RET 255 256INIT_MMX mmxext 257cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a 258 mova m0, [lq] 259 paddw m0, [aq] 260 DEFINE_ARGS dst, stride, stride3 261 lea stride3q, [strideq*3] 262 pmaddwd m0, [pw_1] 263 pshufw m1, m0, q3232 264 paddd m0, [pd_4] 265 paddd m0, m1 266 psrad m0, 3 267 pshufw m0, m0, q0000 268 mova [dstq+strideq*0], m0 269 mova [dstq+strideq*1], m0 270 mova [dstq+strideq*2], m0 271 mova [dstq+stride3q ], m0 272 RET 273 274INIT_XMM sse2 275cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a 276 mova m0, [lq] 277 paddw m0, [aq] 278 DEFINE_ARGS dst, stride, stride3 279 lea stride3q, [strideq*3] 280 pmaddwd m0, [pw_1] 281 pshufd m1, m0, q3232 282 paddd m0, m1 283 pshufd m1, m0, q1111 284 paddd m0, [pd_8] 285 paddd m0, m1 286 psrad m0, 4 287 pshuflw m0, m0, q0000 288 punpcklqdq m0, m0 289 mova [dstq+strideq*0], m0 290 mova [dstq+strideq*1], m0 291 mova [dstq+strideq*2], m0 292 mova [dstq+stride3q ], m0 293 lea dstq, [dstq+strideq*4] 294 mova [dstq+strideq*0], m0 295 mova [dstq+strideq*1], m0 296 mova [dstq+strideq*2], m0 297 mova [dstq+stride3q ], m0 298 RET 299 300INIT_XMM sse2 301cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a 302 mova m0, [lq] 303 paddw m0, [lq+mmsize] 304 paddw m0, [aq] 305 paddw m0, [aq+mmsize] 306 DEFINE_ARGS dst, stride, stride3, cnt 307 lea stride3q, [strideq*3] 308 mov cntd, 4 309 pmaddwd m0, [pw_1] 310 pshufd m1, m0, q3232 311 paddd m0, m1 312 pshufd m1, m0, q1111 313 paddd m0, [pd_16] 314 paddd m0, m1 315 psrad m0, 5 316 pshuflw m0, m0, q0000 317 punpcklqdq m0, m0 318.loop: 319 mova [dstq+strideq*0+ 0], m0 320 mova [dstq+strideq*0+16], m0 321 mova [dstq+strideq*1+ 0], m0 322 mova [dstq+strideq*1+16], m0 323 mova [dstq+strideq*2+ 0], m0 324 mova [dstq+strideq*2+16], m0 325 mova [dstq+stride3q + 0], m0 326 mova [dstq+stride3q +16], m0 327 lea dstq, [dstq+strideq*4] 328 dec cntd 329 jg .loop 330 RET 331 332INIT_XMM sse2 333cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a 334 mova m0, [lq+mmsize*0] 335 paddw m0, [lq+mmsize*1] 336 paddw m0, [lq+mmsize*2] 337 paddw m0, [lq+mmsize*3] 338 paddw m0, [aq+mmsize*0] 339 paddw m0, [aq+mmsize*1] 340 paddw m0, [aq+mmsize*2] 341 paddw m0, [aq+mmsize*3] 342 DEFINE_ARGS dst, stride, stride3, cnt 343 lea stride3q, [strideq*3] 344 mov cntd, 16 345 pmaddwd m0, [pw_1] 346 pshufd m1, m0, q3232 347 paddd m0, m1 348 pshufd m1, m0, q1111 349 paddd m0, [pd_32] 350 paddd m0, m1 351 psrad m0, 6 352 pshuflw m0, m0, q0000 353 punpcklqdq m0, m0 354.loop: 355 mova [dstq+strideq*0+ 0], m0 356 mova [dstq+strideq*0+16], m0 357 mova [dstq+strideq*0+32], m0 358 mova [dstq+strideq*0+48], m0 359 mova [dstq+strideq*1+ 0], m0 360 mova [dstq+strideq*1+16], m0 361 mova [dstq+strideq*1+32], m0 362 mova [dstq+strideq*1+48], m0 363 lea dstq, [dstq+strideq*2] 364 dec cntd 365 jg .loop 366 RET 367 368%macro DC_1D_FNS 2 369INIT_MMX mmxext 370cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a 371 mova m0, [%2] 372 DEFINE_ARGS dst, stride, stride3 373 lea stride3q, [strideq*3] 374 pmaddwd m0, [pw_1] 375 pshufw m1, m0, q3232 376 paddd m0, [pd_2] 377 paddd m0, m1 378 psrad m0, 2 379 pshufw m0, m0, q0000 380 mova [dstq+strideq*0], m0 381 mova [dstq+strideq*1], m0 382 mova [dstq+strideq*2], m0 383 mova [dstq+stride3q ], m0 384 RET 385 386INIT_XMM sse2 387cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a 388 mova m0, [%2] 389 DEFINE_ARGS dst, stride, stride3 390 lea stride3q, [strideq*3] 391 pmaddwd m0, [pw_1] 392 pshufd m1, m0, q3232 393 paddd m0, m1 394 pshufd m1, m0, q1111 395 paddd m0, [pd_4] 396 paddd m0, m1 397 psrad m0, 3 398 pshuflw m0, m0, q0000 399 punpcklqdq m0, m0 400 mova [dstq+strideq*0], m0 401 mova [dstq+strideq*1], m0 402 mova [dstq+strideq*2], m0 403 mova [dstq+stride3q ], m0 404 lea dstq, [dstq+strideq*4] 405 mova [dstq+strideq*0], m0 406 mova [dstq+strideq*1], m0 407 mova [dstq+strideq*2], m0 408 mova [dstq+stride3q ], m0 409 RET 410 411INIT_XMM sse2 412cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a 413 mova m0, [%2] 414 paddw m0, [%2+mmsize] 415 DEFINE_ARGS dst, stride, stride3, cnt 416 lea stride3q, [strideq*3] 417 mov cntd, 4 418 pmaddwd m0, [pw_1] 419 pshufd m1, m0, q3232 420 paddd m0, m1 421 pshufd m1, m0, q1111 422 paddd m0, [pd_8] 423 paddd m0, m1 424 psrad m0, 4 425 pshuflw m0, m0, q0000 426 punpcklqdq m0, m0 427.loop: 428 mova [dstq+strideq*0+ 0], m0 429 mova [dstq+strideq*0+16], m0 430 mova [dstq+strideq*1+ 0], m0 431 mova [dstq+strideq*1+16], m0 432 mova [dstq+strideq*2+ 0], m0 433 mova [dstq+strideq*2+16], m0 434 mova [dstq+stride3q + 0], m0 435 mova [dstq+stride3q +16], m0 436 lea dstq, [dstq+strideq*4] 437 dec cntd 438 jg .loop 439 RET 440 441INIT_XMM sse2 442cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a 443 mova m0, [%2+mmsize*0] 444 paddw m0, [%2+mmsize*1] 445 paddw m0, [%2+mmsize*2] 446 paddw m0, [%2+mmsize*3] 447 DEFINE_ARGS dst, stride, cnt 448 mov cntd, 16 449 pmaddwd m0, [pw_1] 450 pshufd m1, m0, q3232 451 paddd m0, m1 452 pshufd m1, m0, q1111 453 paddd m0, [pd_16] 454 paddd m0, m1 455 psrad m0, 5 456 pshuflw m0, m0, q0000 457 punpcklqdq m0, m0 458.loop: 459 mova [dstq+strideq*0+ 0], m0 460 mova [dstq+strideq*0+16], m0 461 mova [dstq+strideq*0+32], m0 462 mova [dstq+strideq*0+48], m0 463 mova [dstq+strideq*1+ 0], m0 464 mova [dstq+strideq*1+16], m0 465 mova [dstq+strideq*1+32], m0 466 mova [dstq+strideq*1+48], m0 467 lea dstq, [dstq+strideq*2] 468 dec cntd 469 jg .loop 470 RET 471%endmacro 472 473DC_1D_FNS top, aq 474DC_1D_FNS left, lq 475 476INIT_MMX mmxext 477cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a 478 mova m5, [pw_1023] 479.body: 480 mova m4, [aq] 481 mova m3, [lq] 482 movd m0, [aq-4] 483 pshufw m0, m0, q1111 484 psubw m4, m0 485 DEFINE_ARGS dst, stride, stride3 486 lea stride3q, [strideq*3] 487 pshufw m0, m3, q3333 488 pshufw m1, m3, q2222 489 pshufw m2, m3, q1111 490 pshufw m3, m3, q0000 491 paddw m0, m4 492 paddw m1, m4 493 paddw m2, m4 494 paddw m3, m4 495 pxor m4, m4 496 pmaxsw m0, m4 497 pmaxsw m1, m4 498 pmaxsw m2, m4 499 pmaxsw m3, m4 500 pminsw m0, m5 501 pminsw m1, m5 502 pminsw m2, m5 503 pminsw m3, m5 504 mova [dstq+strideq*0], m0 505 mova [dstq+strideq*1], m1 506 mova [dstq+strideq*2], m2 507 mova [dstq+stride3q ], m3 508 RET 509 510cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a 511 mova m5, [pw_4095] 512 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body 513 514INIT_XMM sse2 515cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a 516 mova m4, [pw_1023] 517.body: 518 pxor m6, m6 519 mova m5, [aq] 520 movd m0, [aq-4] 521 pshuflw m0, m0, q1111 522 punpcklqdq m0, m0 523 psubw m5, m0 524 DEFINE_ARGS dst, stride, l, stride3, cnt 525 lea stride3q, [strideq*3] 526 mov cntd, 1 527.loop: 528 movh m3, [lq+cntq*8] 529 punpcklwd m3, m3 530 pshufd m0, m3, q3333 531 pshufd m1, m3, q2222 532 pshufd m2, m3, q1111 533 pshufd m3, m3, q0000 534 paddw m0, m5 535 paddw m1, m5 536 paddw m2, m5 537 paddw m3, m5 538 pmaxsw m0, m6 539 pmaxsw m1, m6 540 pmaxsw m2, m6 541 pmaxsw m3, m6 542 pminsw m0, m4 543 pminsw m1, m4 544 pminsw m2, m4 545 pminsw m3, m4 546 mova [dstq+strideq*0], m0 547 mova [dstq+strideq*1], m1 548 mova [dstq+strideq*2], m2 549 mova [dstq+stride3q ], m3 550 lea dstq, [dstq+strideq*4] 551 dec cntd 552 jge .loop 553 RET 554 555cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a 556 mova m4, [pw_4095] 557 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body 558 559INIT_XMM sse2 560cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a 561 mova m7, [pw_1023] 562.body: 563 pxor m6, m6 564 mova m4, [aq] 565 mova m5, [aq+mmsize] 566 movd m0, [aq-4] 567 pshuflw m0, m0, q1111 568 punpcklqdq m0, m0 569 psubw m4, m0 570 psubw m5, m0 571 DEFINE_ARGS dst, stride, l, cnt 572 mov cntd, 7 573.loop: 574 movd m3, [lq+cntq*4] 575 punpcklwd m3, m3 576 pshufd m2, m3, q1111 577 pshufd m3, m3, q0000 578 paddw m0, m2, m4 579 paddw m2, m5 580 paddw m1, m3, m4 581 paddw m3, m5 582 pmaxsw m0, m6 583 pmaxsw m2, m6 584 pmaxsw m1, m6 585 pmaxsw m3, m6 586 pminsw m0, m7 587 pminsw m2, m7 588 pminsw m1, m7 589 pminsw m3, m7 590 mova [dstq+strideq*0+ 0], m0 591 mova [dstq+strideq*0+16], m2 592 mova [dstq+strideq*1+ 0], m1 593 mova [dstq+strideq*1+16], m3 594 lea dstq, [dstq+strideq*2] 595 dec cntd 596 jge .loop 597 RET 598 599cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a 600 mova m7, [pw_4095] 601 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body 602 603INIT_XMM sse2 604cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a 605 mova m0, [pw_1023] 606.body: 607 pxor m1, m1 608%if ARCH_X86_64 609 SWAP 0, 8 610 SWAP 1, 9 611%define reg_min m9 612%define reg_max m8 613%else 614 mova [rsp+ 0], m0 615 mova [rsp+16], m1 616%define reg_min [rsp+16] 617%define reg_max [rsp+ 0] 618%endif 619 620 mova m4, [aq+mmsize*0] 621 mova m5, [aq+mmsize*1] 622 mova m6, [aq+mmsize*2] 623 mova m7, [aq+mmsize*3] 624 movd m0, [aq-4] 625 pshuflw m0, m0, q1111 626 punpcklqdq m0, m0 627 psubw m4, m0 628 psubw m5, m0 629 psubw m6, m0 630 psubw m7, m0 631 DEFINE_ARGS dst, stride, l, cnt 632 mov cntd, 31 633.loop: 634 pinsrw m3, [lq+cntq*2], 0 635 punpcklwd m3, m3 636 pshufd m3, m3, q0000 637 paddw m0, m3, m4 638 paddw m1, m3, m5 639 paddw m2, m3, m6 640 paddw m3, m7 641 pmaxsw m0, reg_min 642 pmaxsw m1, reg_min 643 pmaxsw m2, reg_min 644 pmaxsw m3, reg_min 645 pminsw m0, reg_max 646 pminsw m1, reg_max 647 pminsw m2, reg_max 648 pminsw m3, reg_max 649 mova [dstq+strideq*0+ 0], m0 650 mova [dstq+strideq*0+16], m1 651 mova [dstq+strideq*0+32], m2 652 mova [dstq+strideq*0+48], m3 653 add dstq, strideq 654 dec cntd 655 jge .loop 656 RET 657 658cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a 659 mova m0, [pw_4095] 660 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body 661 662; Directional intra predicion functions 663; 664; in the functions below, 'abcdefgh' refers to above data (sometimes simply 665; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply 666; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered 667; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered 668; top-left data. 669 670; left=(left+2*center+right+2)>>2 671%macro LOWPASS 3 ; left [dst], center, right 672 paddw m%1, m%3 673 psraw m%1, 1 674 pavgw m%1, m%2 675%endmacro 676 677; abcdefgh (src) -> bcdefghh (dst) 678; dst/src can be the same register 679%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg] 680%if cpuflag(ssse3) 681 pshufb %1, %2, %3 ; abcdefgh -> bcdefghh 682%else 683 psrldq %1, %2, 2 ; abcdefgh -> bcdefgh. 684 pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh 685%endif 686%endmacro 687 688; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2) 689%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg] 690%if cpuflag(ssse3) 691 pshufb %1, %3, %4 ; abcdefgh -> bcdefghh 692 pshufb %2, %1, %4 ; bcdefghh -> cdefghhh 693%else 694 psrldq %1, %3, 2 ; abcdefgh -> bcdefgh. 695 psrldq %2, %3, 4 ; abcdefgh -> cdefgh.. 696 pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh 697 pshufhw %2, %2, q1110 ; cdefgh.. -> cdefghhh 698%endif 699%endmacro 700 701%macro DL_FUNCS 0 702cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a 703 movifnidn aq, amp 704 movu m1, [aq] ; abcdefgh 705 pshufhw m0, m1, q3310 ; abcdefhh 706 SHIFT_RIGHT m1, m1 ; bcdefghh 707 psrldq m2, m1, 2 ; cdefghh. 708 LOWPASS 0, 1, 2 ; BCDEFGh. 709 pshufd m1, m0, q3321 ; DEFGh... 710 movh [dstq+strideq*0], m0 711 movh [dstq+strideq*2], m1 712 add dstq, strideq 713 psrldq m0, 2 ; CDEFGh.. 714 psrldq m1, 2 ; EFGh.... 715 movh [dstq+strideq*0], m0 716 movh [dstq+strideq*2], m1 717 RET 718 719cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a 720 movifnidn aq, amp 721 mova m0, [aq] ; abcdefgh 722%if cpuflag(ssse3) 723 mova m4, [pb_2to15_14_15] 724%endif 725 SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh 726 LOWPASS 0, 1, 2 ; BCDEFGHh 727 shufps m1, m0, m2, q3332 ; FGHhhhhh 728 shufps m3, m0, m1, q2121 ; DEFGHhhh 729 DEFINE_ARGS dst, stride, stride5 730 lea stride5q, [strideq*5] 731 732 mova [dstq+strideq*0], m0 733 mova [dstq+strideq*4], m1 734 SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh 735 pshuflw m1, m1, q3321 ; GHhhhhhh 736 pshufd m2, m0, q3321 ; EFGHhhhh 737 mova [dstq+strideq*1], m0 738 mova [dstq+stride5q ], m1 739 lea dstq, [dstq+strideq*2] 740 pshuflw m1, m1, q3321 ; Hhhhhhhh 741 mova [dstq+strideq*0], m3 742 mova [dstq+strideq*4], m1 743 pshuflw m1, m1, q3321 ; hhhhhhhh 744 mova [dstq+strideq*1], m2 745 mova [dstq+stride5q ], m1 746 RET 747 748cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a 749 movifnidn aq, amp 750 mova m0, [aq] ; abcdefgh 751 mova m3, [aq+mmsize] ; ijklmnop 752 PALIGNR m1, m3, m0, 2, m4 ; bcdefghi 753 PALIGNR m2, m3, m0, 4, m4 ; cdefghij 754 LOWPASS 0, 1, 2 ; BCDEFGHI 755%if cpuflag(ssse3) 756 mova m4, [pb_2to15_14_15] 757%endif 758 SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp 759 LOWPASS 1, 2, 3 ; JKLMNOPp 760 pshufd m2, m2, q3333 ; pppppppp 761 DEFINE_ARGS dst, stride, cnt 762 mov cntd, 8 763 764.loop: 765 mova [dstq+strideq*0+ 0], m0 766 mova [dstq+strideq*0+16], m1 767 mova [dstq+strideq*8+ 0], m1 768 mova [dstq+strideq*8+16], m2 769 add dstq, strideq 770%if cpuflag(avx) 771 vpalignr m0, m1, m0, 2 772%else 773 PALIGNR m3, m1, m0, 2, m4 774 mova m0, m3 775%endif 776 SHIFT_RIGHT m1, m1, m4 777 dec cntd 778 jg .loop 779 RET 780 781cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a 782 movifnidn aq, amp 783 mova m0, [aq+mmsize*0] ; abcdefgh 784 mova m1, [aq+mmsize*1] ; ijklmnop 785 mova m2, [aq+mmsize*2] ; qrstuvwx 786 mova m3, [aq+mmsize*3] ; yz012345 787 PALIGNR m4, m1, m0, 2, m6 788 PALIGNR m5, m1, m0, 4, m6 789 LOWPASS 0, 4, 5 ; BCDEFGHI 790 PALIGNR m4, m2, m1, 2, m6 791 PALIGNR m5, m2, m1, 4, m6 792 LOWPASS 1, 4, 5 ; JKLMNOPQ 793 PALIGNR m4, m3, m2, 2, m6 794 PALIGNR m5, m3, m2, 4, m6 795 LOWPASS 2, 4, 5 ; RSTUVWXY 796%if cpuflag(ssse3) 797 mova m6, [pb_2to15_14_15] 798%endif 799 SHIFT_RIGHTx2 m4, m5, m3, m6 800 LOWPASS 3, 4, 5 ; Z0123455 801 pshufd m4, m4, q3333 ; 55555555 802 DEFINE_ARGS dst, stride, stride8, stride24, cnt 803 mov cntd, 8 804 lea stride8q, [strideq*8] 805 lea stride24q, [stride8q*3] 806 807.loop: 808 mova [dstq+stride8q*0+ 0], m0 809 mova [dstq+stride8q*0+16], m1 810 mova [dstq+stride8q*0+32], m2 811 mova [dstq+stride8q*0+48], m3 812 mova [dstq+stride8q*1+ 0], m1 813 mova [dstq+stride8q*1+16], m2 814 mova [dstq+stride8q*1+32], m3 815 mova [dstq+stride8q*1+48], m4 816 mova [dstq+stride8q*2+ 0], m2 817 mova [dstq+stride8q*2+16], m3 818 mova [dstq+stride8q*2+32], m4 819 mova [dstq+stride8q*2+48], m4 820 mova [dstq+stride24q + 0], m3 821 mova [dstq+stride24q +16], m4 822 mova [dstq+stride24q +32], m4 823 mova [dstq+stride24q +48], m4 824 add dstq, strideq 825%if cpuflag(avx) 826 vpalignr m0, m1, m0, 2 827 vpalignr m1, m2, m1, 2 828 vpalignr m2, m3, m2, 2 829%else 830 PALIGNR m5, m1, m0, 2, m6 831 mova m0, m5 832 PALIGNR m5, m2, m1, 2, m6 833 mova m1, m5 834 PALIGNR m5, m3, m2, 2, m6 835 mova m2, m5 836%endif 837 SHIFT_RIGHT m3, m3, m6 838 dec cntd 839 jg .loop 840 RET 841%endmacro 842 843INIT_XMM sse2 844DL_FUNCS 845INIT_XMM ssse3 846DL_FUNCS 847INIT_XMM avx 848DL_FUNCS 849 850%if HAVE_AVX2_EXTERNAL 851INIT_YMM avx2 852cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a 853 movifnidn aq, amp 854 mova m0, [aq] ; abcdefghijklmnop 855 vpbroadcastw xm1, [aq+30] ; pppppppp 856 vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp 857 vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp 858 vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp 859 LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp 860 vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp 861 DEFINE_ARGS dst, stride, stride3, cnt 862 mov cntd, 2 863 lea stride3q, [strideq*3] 864 865.loop: 866 mova [dstq+strideq*0], m0 867 vpalignr m3, m2, m0, 2 868 vpalignr m4, m2, m0, 4 869 mova [dstq+strideq*1], m3 870 mova [dstq+strideq*2], m4 871 vpalignr m3, m2, m0, 6 872 vpalignr m4, m2, m0, 8 873 mova [dstq+stride3q ], m3 874 lea dstq, [dstq+strideq*4] 875 mova [dstq+strideq*0], m4 876 vpalignr m3, m2, m0, 10 877 vpalignr m4, m2, m0, 12 878 mova [dstq+strideq*1], m3 879 mova [dstq+strideq*2], m4 880 vpalignr m3, m2, m0, 14 881 mova [dstq+stride3q ], m3 882 lea dstq, [dstq+strideq*4] 883 mova m0, m2 884 vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp 885 dec cntd 886 jg .loop 887 RET 888 889cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a 890 movifnidn aq, amp 891 mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop 892 mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345 893 vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555 894 vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx 895 vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq 896 vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr 897 LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ 898 vperm2i128 m5, m1, m4, q0201 ; yz01234555555555 899 vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455 900 vpalignr m3, m5, m1, 4 ; stuvwxyz01234555 901 LOWPASS 1, 2, 3 ; RSTUVWXYZ......5 902 vperm2i128 m2, m1, m4, q0201 ; Z......555555555 903 vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY 904 DEFINE_ARGS dst, stride, stride3, cnt 905 lea stride3q, [strideq*3] 906 mov cntd, 4 907 908.loop: 909 mova [dstq+strideq*0 + 0], m0 910 mova [dstq+strideq*0 +32], m1 911 vpalignr m3, m5, m0, 2 912 vpalignr m4, m2, m1, 2 913 mova [dstq+strideq*1 + 0], m3 914 mova [dstq+strideq*1 +32], m4 915 vpalignr m3, m5, m0, 4 916 vpalignr m4, m2, m1, 4 917 mova [dstq+strideq*2 + 0], m3 918 mova [dstq+strideq*2 +32], m4 919 vpalignr m3, m5, m0, 6 920 vpalignr m4, m2, m1, 6 921 mova [dstq+stride3q*1+ 0], m3 922 mova [dstq+stride3q*1+32], m4 923 lea dstq, [dstq+strideq*4] 924 vpalignr m3, m5, m0, 8 925 vpalignr m4, m2, m1, 8 926 mova [dstq+strideq*0 + 0], m3 927 mova [dstq+strideq*0 +32], m4 928 vpalignr m3, m5, m0, 10 929 vpalignr m4, m2, m1, 10 930 mova [dstq+strideq*1 + 0], m3 931 mova [dstq+strideq*1 +32], m4 932 vpalignr m3, m5, m0, 12 933 vpalignr m4, m2, m1, 12 934 mova [dstq+strideq*2+ 0], m3 935 mova [dstq+strideq*2+32], m4 936 vpalignr m3, m5, m0, 14 937 vpalignr m4, m2, m1, 14 938 mova [dstq+stride3q+ 0], m3 939 mova [dstq+stride3q+ 32], m4 940 vpalignr m3, m5, m0, 16 941 vpalignr m4, m2, m1, 16 942 vperm2i128 m5, m3, m4, q0201 943 vperm2i128 m2, m4, m4, q0101 944 mova m0, m3 945 mova m1, m4 946 lea dstq, [dstq+strideq*4] 947 dec cntd 948 jg .loop 949 RET 950%endif 951 952%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function 953cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a 954 movh m0, [lq] ; wxyz.... 955 movhps m0, [aq-2] ; wxyz*abc 956 movd m1, [aq+6] ; d....... 957 PALIGNR m1, m0, 2, m2 ; xyz*abcd 958 psrldq m2, m1, 2 ; yz*abcd. 959 LOWPASS 0, 1, 2 ; XYZ#ABC. 960 DEFINE_ARGS dst, stride, stride3 961 lea stride3q, [strideq*3] 962 963 movh [dstq+stride3q ], m0 964 psrldq m0, 2 ; YZ#ABC.. 965 movh [dstq+strideq*2], m0 966 psrldq m0, 2 ; Z#ABC... 967 movh [dstq+strideq*1], m0 968 psrldq m0, 2 ; #ABC.... 969 movh [dstq+strideq*0], m0 970 RET 971 972cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a 973 mova m0, [lq] ; stuvwxyz 974 movu m1, [aq-2] ; *abcdefg 975 mova m2, [aq] ; abcdefgh 976 psrldq m3, m2, 2 ; bcdefgh. 977 LOWPASS 3, 2, 1 ; ABCDEFG. 978 PALIGNR m1, m0, 2, m4 ; tuvwxyz* 979 PALIGNR m2, m1, 2, m4 ; uvwxyz*a 980 LOWPASS 2, 1, 0 ; TUVWXYZ# 981 DEFINE_ARGS dst, stride, dst4, stride3 982 lea stride3q, [strideq*3] 983 lea dst4q, [dstq+strideq*4] 984 985 movhps [dstq +stride3q +0], m2 986 movh [dstq+ stride3q +8], m3 987 mova [dst4q+stride3q +0], m2 988 PALIGNR m1, m3, m2, 2, m0 989 psrldq m3, 2 990 movhps [dstq +strideq*2+0], m1 991 movh [dstq+ strideq*2+8], m3 992 mova [dst4q+strideq*2+0], m1 993 PALIGNR m2, m3, m1, 2, m0 994 psrldq m3, 2 995 movhps [dstq +strideq*1+0], m2 996 movh [dstq+ strideq*1+8], m3 997 mova [dst4q+strideq*1+0], m2 998 PALIGNR m1, m3, m2, 2, m0 999 psrldq m3, 2 1000 movhps [dstq +strideq*0+0], m1 1001 movh [dstq+ strideq*0+8], m3 1002 mova [dst4q+strideq*0+0], m1 1003 RET 1004 1005cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a 1006 mova m0, [lq] ; klmnopqr 1007 mova m1, [lq+mmsize] ; stuvwxyz 1008 movu m2, [aq-2] ; *abcdefg 1009 movu m3, [aq+mmsize-2] ; hijklmno 1010 mova m4, [aq] ; abcdefgh 1011 mova m5, [aq+mmsize] ; ijklmnop 1012 psrldq m6, m5, 2 ; jklmnop. 1013 LOWPASS 6, 5, 3 ; IJKLMNO. 1014 PALIGNR m5, m4, 2, m3 ; bcdefghi 1015 LOWPASS 5, 4, 2 ; ABCDEFGH 1016 PALIGNR m2, m1, 2, m3 ; tuvwxyz* 1017 PALIGNR m4, m2, 2, m3 ; uvwxyz*a 1018 LOWPASS 4, 2, 1 ; TUVWXYZ# 1019 PALIGNR m1, m0, 2, m3 ; lmnopqrs 1020 PALIGNR m2, m1, 2, m3 ; mnopqrst 1021 LOWPASS 2, 1, 0 ; LMNOPQRS 1022 DEFINE_ARGS dst, stride, dst8, cnt 1023 lea dst8q, [dstq+strideq*8] 1024 mov cntd, 8 1025 1026.loop: 1027 sub dst8q, strideq 1028 mova [dst8q+strideq*0+ 0], m4 1029 mova [dst8q+strideq*0+16], m5 1030 mova [dst8q+strideq*8+ 0], m2 1031 mova [dst8q+strideq*8+16], m4 1032%if cpuflag(avx) 1033 vpalignr m2, m4, m2, 2 1034 vpalignr m4, m5, m4, 2 1035 vpalignr m5, m6, m5, 2 1036%else 1037 PALIGNR m0, m4, m2, 2, m1 1038 mova m2, m0 1039 PALIGNR m0, m5, m4, 2, m1 1040 mova m4, m0 1041 PALIGNR m0, m6, m5, 2, m1 1042 mova m5, m0 1043%endif 1044 psrldq m6, 2 1045 dec cntd 1046 jg .loop 1047 RET 1048 1049cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \ 1050 %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a 1051 mova m0, [aq+mmsize*3] ; a[24-31] 1052 movu m1, [aq+mmsize*3-2] ; a[23-30] 1053 psrldq m2, m0, 2 ; a[25-31]. 1054 LOWPASS 2, 0, 1 ; A[24-30]. 1055 mova m1, [aq+mmsize*2] ; a[16-23] 1056 movu m3, [aq+mmsize*2-2] ; a[15-22] 1057 PALIGNR m0, m1, 2, m4 ; a[17-24] 1058 LOWPASS 0, 1, 3 ; A[16-23] 1059 mova m3, [aq+mmsize*1] ; a[8-15] 1060 movu m4, [aq+mmsize*1-2] ; a[7-14] 1061 PALIGNR m1, m3, 2, m5 ; a[9-16] 1062 LOWPASS 1, 3, 4 ; A[8-15] 1063 mova m4, [aq+mmsize*0] ; a[0-7] 1064 movu m5, [aq+mmsize*0-2] ; *a[0-6] 1065 PALIGNR m3, m4, 2, m6 ; a[1-8] 1066 LOWPASS 3, 4, 5 ; A[0-7] 1067 SCRATCH 1, 8, rsp+0*mmsize 1068 SCRATCH 3, 9, rsp+1*mmsize 1069%if notcpuflag(ssse3) 1070 SCRATCH 0, 10, rsp+2*mmsize 1071%endif 1072 mova m6, [lq+mmsize*3] ; l[24-31] 1073 PALIGNR m5, m6, 2, m0 ; l[25-31]* 1074 PALIGNR m4, m5, 2, m0 ; l[26-31]*a 1075 LOWPASS 4, 5, 6 ; L[25-31]# 1076 mova m7, [lq+mmsize*2] ; l[16-23] 1077 PALIGNR m6, m7, 2, m0 ; l[17-24] 1078 PALIGNR m5, m6, 2, m0 ; l[18-25] 1079 LOWPASS 5, 6, 7 ; L[17-24] 1080 mova m1, [lq+mmsize*1] ; l[8-15] 1081 PALIGNR m7, m1, 2, m0 ; l[9-16] 1082 PALIGNR m6, m7, 2, m0 ; l[10-17] 1083 LOWPASS 6, 7, 1 ; L[9-16] 1084 mova m3, [lq+mmsize*0] ; l[0-7] 1085 PALIGNR m1, m3, 2, m0 ; l[1-8] 1086 PALIGNR m7, m1, 2, m0 ; l[2-9] 1087 LOWPASS 7, 1, 3 ; L[1-8] 1088%if cpuflag(ssse3) 1089%if cpuflag(avx) 1090 UNSCRATCH 1, 8, rsp+0*mmsize 1091%endif 1092 UNSCRATCH 3, 9, rsp+1*mmsize 1093%else 1094 UNSCRATCH 0, 10, rsp+2*mmsize 1095%endif 1096 DEFINE_ARGS dst8, stride, stride8, stride24, cnt 1097 lea stride8q, [strideq*8] 1098 lea stride24q, [stride8q*3] 1099 lea dst8q, [dst8q+strideq*8] 1100 mov cntd, 8 1101 1102.loop: 1103 sub dst8q, strideq 1104%if notcpuflag(avx) 1105 UNSCRATCH 1, 8, rsp+0*mmsize 1106%if notcpuflag(ssse3) 1107 UNSCRATCH 3, 9, rsp+1*mmsize 1108%endif 1109%endif 1110 mova [dst8q+stride8q*0+ 0], m4 1111 mova [dst8q+stride8q*0+16], m3 1112 mova [dst8q+stride8q*0+32], m1 1113 mova [dst8q+stride8q*0+48], m0 1114 mova [dst8q+stride8q*1+ 0], m5 1115 mova [dst8q+stride8q*1+16], m4 1116 mova [dst8q+stride8q*1+32], m3 1117 mova [dst8q+stride8q*1+48], m1 1118 mova [dst8q+stride8q*2+ 0], m6 1119 mova [dst8q+stride8q*2+16], m5 1120 mova [dst8q+stride8q*2+32], m4 1121 mova [dst8q+stride8q*2+48], m3 1122 mova [dst8q+stride24q + 0], m7 1123 mova [dst8q+stride24q +16], m6 1124 mova [dst8q+stride24q +32], m5 1125 mova [dst8q+stride24q +48], m4 1126%if cpuflag(avx) 1127 vpalignr m7, m6, m7, 2 1128 vpalignr m6, m5, m6, 2 1129 vpalignr m5, m4, m5, 2 1130 vpalignr m4, m3, m4, 2 1131 vpalignr m3, m1, m3, 2 1132 vpalignr m1, m0, m1, 2 1133 vpalignr m0, m2, m0, 2 1134%else 1135 SCRATCH 2, 8, rsp+0*mmsize 1136%if notcpuflag(ssse3) 1137 SCRATCH 0, 9, rsp+1*mmsize 1138%endif 1139 PALIGNR m2, m6, m7, 2, m0 1140 mova m7, m2 1141 PALIGNR m2, m5, m6, 2, m0 1142 mova m6, m2 1143 PALIGNR m2, m4, m5, 2, m0 1144 mova m5, m2 1145 PALIGNR m2, m3, m4, 2, m0 1146 mova m4, m2 1147 PALIGNR m2, m1, m3, 2, m0 1148 mova m3, m2 1149%if notcpuflag(ssse3) 1150 UNSCRATCH 0, 9, rsp+1*mmsize 1151 SCRATCH 3, 9, rsp+1*mmsize 1152%endif 1153 PALIGNR m2, m0, m1, 2, m3 1154 mova m1, m2 1155 UNSCRATCH 2, 8, rsp+0*mmsize 1156 SCRATCH 1, 8, rsp+0*mmsize 1157 PALIGNR m1, m2, m0, 2, m3 1158 mova m0, m1 1159%endif 1160 psrldq m2, 2 1161 dec cntd 1162 jg .loop 1163 RET 1164%endmacro 1165 1166INIT_XMM sse2 1167DR_FUNCS 3 1168INIT_XMM ssse3 1169DR_FUNCS 2 1170INIT_XMM avx 1171DR_FUNCS 2 1172 1173%if HAVE_AVX2_EXTERNAL 1174INIT_YMM avx2 1175cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a 1176 mova m0, [lq] ; klmnopqrstuvwxyz 1177 movu m1, [aq-2] ; *abcdefghijklmno 1178 mova m2, [aq] ; abcdefghijklmnop 1179 vperm2i128 m4, m2, m2, q2001 ; ijklmnop........ 1180 vpalignr m5, m4, m2, 2 ; bcdefghijklmnop. 1181 vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg 1182 LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. 1183 vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz* 1184 vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a 1185 LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# 1186 vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH 1187 DEFINE_ARGS dst, stride, stride3, stride5, dst3 1188 lea dst3q, [dstq+strideq*4] 1189 lea stride3q, [strideq*3] 1190 lea stride5q, [stride3q+strideq*2] 1191 1192 vpalignr m3, m5, m0, 2 1193 vpalignr m4, m1, m5, 2 1194 mova [dst3q+stride5q*2], m3 ; 14 1195 mova [ dstq+stride3q*2], m4 ; 6 1196 vpalignr m3, m5, m0, 4 1197 vpalignr m4, m1, m5, 4 1198 sub dst3q, strideq 1199 mova [dst3q+stride5q*2], m3 ; 13 1200 mova [dst3q+strideq*2 ], m4 ; 5 1201 mova [dst3q+stride3q*4], m0 ; 15 1202 vpalignr m3, m5, m0, 6 1203 vpalignr m4, m1, m5, 6 1204 mova [dstq+stride3q*4], m3 ; 12 1205 mova [dst3q+strideq*1], m4 ; 4 1206 vpalignr m3, m5, m0, 8 1207 vpalignr m4, m1, m5, 8 1208 mova [dst3q+strideq*8], m3 ; 11 1209 mova [dst3q+strideq*0], m4 ; 3 1210 vpalignr m3, m5, m0, 10 1211 vpalignr m4, m1, m5, 10 1212 mova [dstq+stride5q*2], m3 ; 10 1213 mova [dstq+strideq*2 ], m4 ; 2 1214 vpalignr m3, m5, m0, 12 1215 vpalignr m4, m1, m5, 12 1216 mova [dst3q+stride3q*2], m3 ; 9 1217 mova [dstq+strideq*1 ], m4 ; 1 1218 vpalignr m3, m5, m0, 14 1219 vpalignr m4, m1, m5, 14 1220 mova [dstq+strideq*8], m3 ; 8 1221 mova [dstq+strideq*0], m4 ; 0 1222 mova [dst3q+strideq*4], m5 ; 7 1223 RET 1224 1225cglobal vp9_ipred_vl_16x16_16, 4, 5, 7, dst, stride, l, a 1226 movifnidn aq, amp 1227 mova m0, [aq] ; abcdefghijklmnop 1228 vpbroadcastw xm1, [aq+30] ; pppppppp 1229 vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp 1230 vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp 1231 vperm2i128 m4, m3, m1, q0201 ; jklmnopppppppppp 1232 vpalignr m5, m2, m0, 4 ; cdefghijklmnoppp 1233 vperm2i128 m6, m5, m1, q0201 ; klmnoppppppppppp 1234 LOWPASS 5, 3, 0 ; BCDEFGHIJKLMNOPP 1235 LOWPASS 6, 4, 2 ; JKLMNOPPPPPPPPPP 1236 pavgw m3, m0 ; abcdefghijklmnop 1237 pavgw m4, m2 ; ijklmnoppppppppp 1238 DEFINE_ARGS dst, stride, stride3, stride5, dst4 1239 lea dst4q, [dstq+strideq*4] 1240 lea stride3q, [strideq*3] 1241 lea stride5q, [stride3q+strideq*2] 1242 1243 mova [dstq+strideq*0], m3 ; 0 abcdefghijklmnop 1244 mova [dstq+strideq*1], m5 ; 1 BCDEFGHIJKLMNOPP 1245 vpalignr m0, m4, m3, 2 1246 vpalignr m1, m6, m5, 2 1247 mova [dstq+strideq*2 ], m0 ; 2 bcdefghijklmnopp 1248 mova [dstq+stride3q*1], m1 ; 3 CDEFGHIJKLMNOPPP 1249 vpalignr m0, m4, m3, 4 1250 vpalignr m1, m6, m5, 4 1251 mova [dst4q+strideq*0], m0 ; 4 cdefghijklmnoppp 1252 mova [dstq+stride5q*1], m1 ; 5 DEFGHIJKLMNOPPPP 1253 vpalignr m0, m4, m3, 6 1254 vpalignr m1, m6, m5, 6 1255 mova [ dstq+stride3q*2], m0 ; 6 defghijklmnopppp 1256 mova [dst4q+stride3q*1], m1 ; 7 EFGHIJKLMNOPPPPP 1257 vpalignr m0, m4, m3, 8 1258 vpalignr m1, m6, m5, 8 1259 mova [ dstq+strideq*8], m0 ; 8 efghijklmnoppppp 1260 mova [dst4q+stride5q*1], m1 ; 9 FGHIJKLMNOPPPPPP 1261 vpalignr m0, m4, m3, 10 1262 mova [dstq+stride5q*2], m0 ; 10 fghijklmnopppppp 1263 vpalignr m0, m4, m3, 12 1264 mova [dst4q+strideq*8], m0 ; 12 ghijklmnoppppppp 1265 vpalignr m0, m4, m3, 14 1266 mova [dst4q+stride5q*2], m0 ; 14 hijklmnopppppppp 1267 sub dst4q, strideq 1268 vpalignr m1, m6, m5, 10 1269 mova [dst4q+strideq*8], m1 ; 11 GHIJKLMNOPPPPPPP 1270 vpalignr m1, m6, m5, 12 1271 mova [dst4q+stride5q*2], m1 ; 13 HIJKLMNOPPPPPPPP 1272 vpalignr m1, m6, m5, 14 1273 mova [dst4q+stride3q*4], m1 ; 15 IJKLMNOPPPPPPPPP 1274 RET 1275 1276cglobal vp9_ipred_hd_16x16_16, 4, 5, 7, dst, stride, l, a 1277 movu m0, [aq-2] ; *abcdefghijklmno 1278 mova m1, [lq] ; klmnopqrstuvwxyz 1279 vperm2i128 m2, m1, m0, q0201 ; stuvwxyz*abcdefg 1280 vpalignr m3, m2, m1, 2 ; lmnopqrstuvwxyz* 1281 vpalignr m4, m2, m1, 4 ; mnopqrstuvwxyz*a 1282 LOWPASS 4, 3, 1 ; LMNOPQRSTUVWXYZ# 1283 pavgw m3, m1 ; klmnopqrstuvwxyz 1284 mova m1, [aq] ; abcdefghijklmnop 1285 movu m2, [aq+2] ; bcdefghijklmnop. 1286 LOWPASS 2, 1, 0 ; ABCDEFGHIJKLMNO. 1287 vpunpcklwd m0, m3, m4 ; kLlMmNnOsTtUuVvW 1288 vpunpckhwd m1, m3, m4 ; oPpQqRrSwXxYyZz# 1289 vperm2i128 m3, m1, m0, q0002 ; kLlMmNnOoPpQqRrS 1290 vperm2i128 m4, m0, m1, q0301 ; sTtUuVvWwXxYyZz# 1291 vperm2i128 m0, m4, m2, q0201 ; wXxYyZz#ABCDEFGH 1292 vperm2i128 m1, m3, m4, q0201 ; oPpQqRrSsTtUuVvW 1293 DEFINE_ARGS dst, stride, stride3, stride5, dst5 1294 lea stride3q, [strideq*3] 1295 lea stride5q, [stride3q+strideq*2] 1296 lea dst5q, [dstq+stride5q] 1297 1298 mova [dst5q+stride5q*2], m3 ; 15 kLlMmNnOoPpQqRrS 1299 mova [dst5q+stride3q*2], m1 ; 11 oPpQqRrSsTtUuVvW 1300 mova [dst5q+strideq*2], m4 ; 7 sTtUuVvWwXxYyZz# 1301 mova [dstq+stride3q*1], m0 ; 3 wXxYyZz#ABCDEFGH 1302 vpalignr m5, m4, m1, 4 1303 mova [dstq+stride5q*2], m5 ; 10 pQqRrSsTtUuVvWwX 1304 vpalignr m5, m0, m4, 4 1305 vpalignr m6, m2, m0, 4 1306 mova [dstq+stride3q*2], m5 ; 6 tUuVvWwXxYyZz#AB 1307 mova [dstq+strideq*2], m6 ; 2 xYyZz#ABCDEFGHIJ 1308 vpalignr m5, m4, m1, 8 1309 mova [dst5q+strideq*4], m5 ; 9 qRrSsTtUuVvWwXxY 1310 vpalignr m5, m0, m4, 8 1311 vpalignr m6, m2, m0, 8 1312 mova [dstq+stride5q*1], m5 ; 5 uVvWwXxYyZz#ABCD 1313 mova [dstq+strideq*1], m6 ; 1 yZz#ABCDEFGHIJKL 1314 vpalignr m5, m1, m3, 12 1315 vpalignr m6, m4, m1, 12 1316 mova [dstq+stride3q*4], m5 ; 12 nOoPpQqRrSsTtUuV 1317 mova [dst5q+stride3q], m6 ; 8 rSsTtUuVvWwXxYyZ 1318 vpalignr m5, m0, m4, 12 1319 vpalignr m6, m2, m0, 12 1320 mova [dstq+strideq*4], m5 ; 4 nOoPpQqRrSsTtUuV 1321 mova [dstq+strideq*0], m6 ; 0 z#ABCDEFGHIJKLMN 1322 sub dst5q, strideq 1323 vpalignr m5, m1, m3, 4 1324 mova [dst5q+stride5q*2], m5 ; 14 lMmNnOoPpQqRrSsT 1325 sub dst5q, strideq 1326 vpalignr m5, m1, m3, 8 1327 mova [dst5q+stride5q*2], m5 ; 13 mNnOoPpQqRrSsTtU 1328 RET 1329 1330%if ARCH_X86_64 1331cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a 1332 mova m0, [lq+mmsize*0+0] ; l[0-15] 1333 mova m1, [lq+mmsize*1+0] ; l[16-31] 1334 movu m2, [aq+mmsize*0-2] ; *abcdefghijklmno 1335 mova m3, [aq+mmsize*0+0] ; abcdefghijklmnop 1336 mova m4, [aq+mmsize*1+0] ; qrstuvwxyz012345 1337 vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0 1338 vpalignr m6, m5, m0, 2 ; mnopqrstuvwxyz01 1339 vpalignr m7, m5, m0, 4 ; nopqrstuvwxyz012 1340 LOWPASS 0, 6, 7 ; L[0-15] 1341 vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg 1342 vpalignr m5, m7, m1, 2 ; lmnopqrstuvwxyz* 1343 vpalignr m6, m7, m1, 4 ; mnopqrstuvwxyz*a 1344 LOWPASS 1, 5, 6 ; L[16-31]# 1345 vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx 1346 vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq 1347 LOWPASS 2, 3, 6 ; A[0-15] 1348 movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234 1349 vperm2i128 m6, m4, m4, q2001 ; yz012345........ 1350 vpalignr m7, m6, m4, 2 ; rstuvwxyz012345. 1351 LOWPASS 3, 4, 7 ; A[16-31]. 1352 vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH 1353 vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23] 1354 vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX 1355 DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt 1356 lea stride3q, [strideq*3] 1357 lea stride5q, [stride3q+strideq*2] 1358 lea stride7q, [strideq*4+stride3q] 1359 lea dst24q, [dst8q+stride3q*8] 1360 lea dst8q, [dst8q+strideq*8] 1361 mov cntd, 2 1362 1363.loop: 1364 mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7 1365 mova [dst24q+stride7q+32], m1 1366 mova [dst8q+stride7q+0], m1 1367 mova [dst8q+stride7q+32], m2 1368 vpalignr m6, m4, m1, 2 1369 vpalignr m7, m5, m0, 2 1370 vpalignr m9, m8, m2, 2 1371 mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6 1372 mova [dst24q+stride3q*2+32], m6 1373 mova [dst8q+stride3q*2+0], m6 1374 mova [dst8q+stride3q*2+32], m9 1375 vpalignr m6, m4, m1, 4 1376 vpalignr m7, m5, m0, 4 1377 vpalignr m9, m8, m2, 4 1378 mova [dst24q+stride5q+0], m7 ; 29 21 13 5 1379 mova [dst24q+stride5q+32], m6 1380 mova [dst8q+stride5q+0], m6 1381 mova [dst8q+stride5q+32], m9 1382 vpalignr m6, m4, m1, 6 1383 vpalignr m7, m5, m0, 6 1384 vpalignr m9, m8, m2, 6 1385 mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4 1386 mova [dst24q+strideq*4+32], m6 1387 mova [dst8q+strideq*4+0], m6 1388 mova [dst8q+strideq*4+32], m9 1389 vpalignr m6, m4, m1, 8 1390 vpalignr m7, m5, m0, 8 1391 vpalignr m9, m8, m2, 8 1392 mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3 1393 mova [dst24q+stride3q+32], m6 1394 mova [dst8q+stride3q+0], m6 1395 mova [dst8q+stride3q+32], m9 1396 vpalignr m6, m4, m1, 10 1397 vpalignr m7, m5, m0, 10 1398 vpalignr m9, m8, m2, 10 1399 mova [dst24q+strideq*2+0 ], m7 ; 26 18 10 2 1400 mova [dst24q+strideq*2+32], m6 1401 mova [dst8q+strideq*2+0], m6 1402 mova [dst8q+strideq*2+32], m9 1403 vpalignr m6, m4, m1, 12 1404 vpalignr m7, m5, m0, 12 1405 vpalignr m9, m8, m2, 12 1406 mova [dst24q+strideq+0 ], m7 ; 25 17 9 1 1407 mova [dst24q+strideq+32], m6 1408 mova [dst8q+strideq+0], m6 1409 mova [dst8q+strideq+32], m9 1410 vpalignr m6, m4, m1, 14 1411 vpalignr m7, m5, m0, 14 1412 vpalignr m9, m8, m2, 14 1413 mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0 1414 mova [dst24q+strideq*0+32], m6 1415 mova [dst8q+strideq*0+0], m6 1416 mova [dst8q+strideq*0+32], m9 1417 mova m0, m5 1418 mova m5, m1 1419 mova m1, m4 1420 mova m4, m2 1421 mova m2, m8 1422 mova m8, m3 1423 sub dst24q, stride7q 1424 sub dst24q, strideq 1425 sub dst8q, stride7q 1426 sub dst8q, strideq 1427 dec cntd 1428 jg .loop 1429 RET 1430%endif 1431%endif 1432 1433%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function 1434cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a 1435 movifnidn aq, amp 1436 movu m0, [aq] ; abcdefgh 1437 psrldq m1, m0, 2 ; bcdefgh. 1438 psrldq m2, m0, 4 ; cdefgh.. 1439 LOWPASS 2, 1, 0 ; BCDEFGH. 1440 pavgw m1, m0 ; ABCDEFG. 1441 DEFINE_ARGS dst, stride, stride3 1442 lea stride3q, [strideq*3] 1443 1444 movh [dstq+strideq*0], m1 1445 movh [dstq+strideq*1], m2 1446 psrldq m1, 2 1447 psrldq m2, 2 1448 movh [dstq+strideq*2], m1 1449 movh [dstq+stride3q ], m2 1450 RET 1451 1452cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a 1453 movifnidn aq, amp 1454 mova m0, [aq] ; abcdefgh 1455%if cpuflag(ssse3) 1456 mova m3, [pb_2to15_14_15] 1457%endif 1458 SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh 1459 LOWPASS 2, 1, 0 ; BCDEFGHh 1460 pavgw m1, m0 ; ABCDEFGh 1461 DEFINE_ARGS dst, stride, stride3 1462 lea stride3q, [strideq*3] 1463 1464 mova [dstq+strideq*0], m1 1465 mova [dstq+strideq*1], m2 1466 SHIFT_RIGHT m1, m1, m3 1467 SHIFT_RIGHT m2, m2, m3 1468 mova [dstq+strideq*2], m1 1469 mova [dstq+stride3q ], m2 1470 lea dstq, [dstq+strideq*4] 1471 SHIFT_RIGHT m1, m1, m3 1472 SHIFT_RIGHT m2, m2, m3 1473 mova [dstq+strideq*0], m1 1474 mova [dstq+strideq*1], m2 1475 SHIFT_RIGHT m1, m1, m3 1476 SHIFT_RIGHT m2, m2, m3 1477 mova [dstq+strideq*2], m1 1478 mova [dstq+stride3q ], m2 1479 RET 1480 1481cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a 1482 movifnidn aq, amp 1483 mova m0, [aq] 1484 mova m1, [aq+mmsize] 1485 PALIGNR m2, m1, m0, 2, m3 1486 PALIGNR m3, m1, m0, 4, m4 1487 LOWPASS 3, 2, 0 1488 pavgw m2, m0 1489%if cpuflag(ssse3) 1490 mova m4, [pb_2to15_14_15] 1491%endif 1492 SHIFT_RIGHTx2 m5, m0, m1, m4 1493 LOWPASS 0, 5, 1 1494 pavgw m1, m5 1495 DEFINE_ARGS dst, stride, cnt 1496 mov cntd, 8 1497 1498.loop: 1499 mova [dstq+strideq*0+ 0], m2 1500 mova [dstq+strideq*0+16], m1 1501 mova [dstq+strideq*1+ 0], m3 1502 mova [dstq+strideq*1+16], m0 1503 lea dstq, [dstq+strideq*2] 1504%if cpuflag(avx) 1505 vpalignr m2, m1, m2, 2 1506 vpalignr m3, m0, m3, 2 1507%else 1508 PALIGNR m5, m1, m2, 2, m4 1509 mova m2, m5 1510 PALIGNR m5, m0, m3, 2, m4 1511 mova m3, m5 1512%endif 1513 SHIFT_RIGHT m1, m1, m4 1514 SHIFT_RIGHT m0, m0, m4 1515 dec cntd 1516 jg .loop 1517 RET 1518 1519cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a 1520 movifnidn aq, amp 1521 mova m0, [aq+mmsize*0] 1522 mova m1, [aq+mmsize*1] 1523 mova m2, [aq+mmsize*2] 1524 PALIGNR m6, m1, m0, 2, m5 1525 PALIGNR m7, m1, m0, 4, m5 1526 LOWPASS 7, 6, 0 1527 pavgw m6, m0 1528 SCRATCH 6, 8, rsp+0*mmsize 1529 PALIGNR m4, m2, m1, 2, m0 1530 PALIGNR m5, m2, m1, 4, m0 1531 LOWPASS 5, 4, 1 1532 pavgw m4, m1 1533 mova m0, [aq+mmsize*3] 1534 PALIGNR m1, m0, m2, 2, m6 1535 PALIGNR m3, m0, m2, 4, m6 1536 LOWPASS 3, 1, 2 1537 pavgw m2, m1 1538%if cpuflag(ssse3) 1539 PRELOAD 10, pb_2to15_14_15, shuf 1540%endif 1541 SHIFT_RIGHTx2 m6, m1, m0, reg_shuf 1542 LOWPASS 1, 6, 0 1543 pavgw m0, m6 1544%if ARCH_X86_64 1545 pshufd m9, m6, q3333 1546%endif 1547%if cpuflag(avx) 1548 UNSCRATCH 6, 8, rsp+0*mmsize 1549%endif 1550 DEFINE_ARGS dst, stride, cnt, stride16, stride17 1551 mov stride16q, strideq 1552 mov cntd, 8 1553 shl stride16q, 4 1554 lea stride17q, [stride16q+strideq] 1555 1556 ; FIXME m8 is unused for avx, so we could save one register here for win64 1557.loop: 1558%if notcpuflag(avx) 1559 UNSCRATCH 6, 8, rsp+0*mmsize 1560%endif 1561 mova [dstq+strideq*0+ 0], m6 1562 mova [dstq+strideq*0+16], m4 1563 mova [dstq+strideq*0+32], m2 1564 mova [dstq+strideq*0+48], m0 1565 mova [dstq+strideq*1+ 0], m7 1566 mova [dstq+strideq*1+16], m5 1567 mova [dstq+strideq*1+32], m3 1568 mova [dstq+strideq*1+48], m1 1569 mova [dstq+stride16q+ 0], m4 1570 mova [dstq+stride16q+16], m2 1571 mova [dstq+stride16q+32], m0 1572%if ARCH_X86_64 1573 mova [dstq+stride16q+48], m9 1574%endif 1575 mova [dstq+stride17q+ 0], m5 1576 mova [dstq+stride17q+16], m3 1577 mova [dstq+stride17q+32], m1 1578%if ARCH_X86_64 1579 mova [dstq+stride17q+48], m9 1580%endif 1581 lea dstq, [dstq+strideq*2] 1582%if cpuflag(avx) 1583 vpalignr m6, m4, m6, 2 1584 vpalignr m4, m2, m4, 2 1585 vpalignr m2, m0, m2, 2 1586 vpalignr m7, m5, m7, 2 1587 vpalignr m5, m3, m5, 2 1588 vpalignr m3, m1, m3, 2 1589%else 1590 SCRATCH 3, 8, rsp+0*mmsize 1591%if notcpuflag(ssse3) 1592 SCRATCH 1, 10, rsp+1*mmsize 1593%endif 1594 PALIGNR m3, m4, m6, 2, m1 1595 mova m6, m3 1596 PALIGNR m3, m2, m4, 2, m1 1597 mova m4, m3 1598 PALIGNR m3, m0, m2, 2, m1 1599 mova m2, m3 1600 PALIGNR m3, m5, m7, 2, m1 1601 mova m7, m3 1602 UNSCRATCH 3, 8, rsp+0*mmsize 1603 SCRATCH 6, 8, rsp+0*mmsize 1604%if notcpuflag(ssse3) 1605 UNSCRATCH 1, 10, rsp+1*mmsize 1606 SCRATCH 7, 10, rsp+1*mmsize 1607%endif 1608 PALIGNR m6, m3, m5, 2, m7 1609 mova m5, m6 1610 PALIGNR m6, m1, m3, 2, m7 1611 mova m3, m6 1612%if notcpuflag(ssse3) 1613 UNSCRATCH 7, 10, rsp+1*mmsize 1614%endif 1615%endif 1616 SHIFT_RIGHT m1, m1, reg_shuf 1617 SHIFT_RIGHT m0, m0, reg_shuf 1618 dec cntd 1619 jg .loop 1620 1621%if ARCH_X86_32 1622 DEFINE_ARGS dst, stride, stride3 1623 lea stride3q, [strideq*3] 1624%assign %%n 0 1625%rep 4 1626 mova [dstq+strideq*0+48], m0 1627 mova [dstq+strideq*1+48], m0 1628 mova [dstq+strideq*2+48], m0 1629 mova [dstq+stride3q +48], m0 1630%if %%n < 3 1631 lea dstq, [dstq+strideq*4] 1632%endif 1633%assign %%n (%%n+1) 1634%endrep 1635%endif 1636 RET 1637%endmacro 1638 1639INIT_XMM sse2 1640VL_FUNCS 2 1641INIT_XMM ssse3 1642VL_FUNCS 1 1643INIT_XMM avx 1644VL_FUNCS 1 1645 1646%macro VR_FUNCS 0 1647cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a 1648 movu m0, [aq-2] 1649 movhps m1, [lq] 1650 PALIGNR m0, m1, 10, m2 ; xyz*abcd 1651 pslldq m1, m0, 2 ; .xyz*abc 1652 pslldq m2, m0, 4 ; ..xyz*ab 1653 LOWPASS 2, 1, 0 ; ..YZ#ABC 1654 pavgw m1, m0 ; ....#ABC 1655 DEFINE_ARGS dst, stride, stride3 1656 lea stride3q, [strideq*3] 1657 1658 movhps [dstq+strideq*0], m1 1659 movhps [dstq+strideq*1], m2 1660 shufps m0, m2, m1, q3210 1661%if cpuflag(ssse3) 1662 pshufb m2, [pb_4_5_8to13_8x0] 1663%else 1664 pshuflw m2, m2, q2222 1665 psrldq m2, 6 1666%endif 1667 psrldq m0, 6 1668 movh [dstq+strideq*2], m0 1669 movh [dstq+stride3q ], m2 1670 RET 1671 1672cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a 1673 movu m1, [aq-2] ; *abcdefg 1674 movu m2, [lq] ; stuvwxyz 1675 mova m0, [aq] ; abcdefgh 1676 PALIGNR m3, m1, m2, 14, m4 ; z*abcdef 1677 LOWPASS 3, 1, 0 1678 pavgw m0, m1 1679 PALIGNR m1, m2, 2, m4 ; tuvwxyz* 1680 pslldq m4, m2, 2 ; .stuvwxy 1681 LOWPASS 4, 2, 1 1682 DEFINE_ARGS dst, stride, stride3 1683 lea stride3q, [strideq*3] 1684 1685 mova [dstq+strideq*0], m0 1686 mova [dstq+strideq*1], m3 1687 PALIGNR m0, m4, 14, m1 1688 pslldq m4, 2 1689 PALIGNR m3, m4, 14, m1 1690 pslldq m4, 2 1691 mova [dstq+strideq*2], m0 1692 mova [dstq+stride3q ], m3 1693 lea dstq, [dstq+strideq*4] 1694 PALIGNR m0, m4, 14, m1 1695 pslldq m4, 2 1696 PALIGNR m3, m4, 14, m1 1697 pslldq m4, 2 1698 mova [dstq+strideq*0], m0 1699 mova [dstq+strideq*1], m3 1700 PALIGNR m0, m4, 14, m1 1701 pslldq m4, 2 1702 PALIGNR m3, m4, 14, m4 1703 mova [dstq+strideq*2], m0 1704 mova [dstq+stride3q ], m3 1705 RET 1706 1707cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a 1708 movu m1, [aq-2] ; *abcdefg 1709 movu m2, [aq+mmsize-2] ; hijklmno 1710 mova m3, [aq] ; abcdefgh 1711 mova m4, [aq+mmsize] ; ijklmnop 1712 mova m5, [lq+mmsize] ; stuvwxyz 1713 PALIGNR m0, m1, m5, 14, m6 ; z*abcdef 1714 movu m6, [aq+mmsize-4] ; ghijklmn 1715 LOWPASS 6, 2, 4 1716 pavgw m2, m4 1717 LOWPASS 0, 1, 3 1718 pavgw m3, m1 1719 PALIGNR m1, m5, 2, m7 ; tuvwxyz* 1720 movu m7, [lq+mmsize-2] ; rstuvwxy 1721 LOWPASS 1, 5, 7 1722 movu m5, [lq+2] ; lmnopqrs 1723 pslldq m4, m5, 2 ; .lmnopqr 1724 pslldq m7, m5, 4 ; ..lmnopq 1725 LOWPASS 5, 4, 7 1726 psrld m4, m1, 16 1727 psrld m7, m5, 16 1728 pand m1, [pd_65535] 1729 pand m5, [pd_65535] 1730 packssdw m7, m4 1731 packssdw m5, m1 1732 DEFINE_ARGS dst, stride, cnt 1733 mov cntd, 8 1734 1735.loop: 1736 mova [dstq+strideq*0+ 0], m3 1737 mova [dstq+strideq*0+16], m2 1738 mova [dstq+strideq*1+ 0], m0 1739 mova [dstq+strideq*1+16], m6 1740 lea dstq, [dstq+strideq*2] 1741 PALIGNR m2, m3, 14, m4 1742 PALIGNR m3, m7, 14, m4 1743 pslldq m7, 2 1744 PALIGNR m6, m0, 14, m4 1745 PALIGNR m0, m5, 14, m4 1746 pslldq m5, 2 1747 dec cntd 1748 jg .loop 1749 RET 1750 1751cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a 1752 movu m0, [aq+mmsize*0-2] ; *a[0-6] 1753 movu m1, [aq+mmsize*1-2] ; a[7-14] 1754 movu m2, [aq+mmsize*2-2] ; a[15-22] 1755 movu m3, [aq+mmsize*3-2] ; a[23-30] 1756 mova m4, [aq+mmsize*3+0] ; a[24-31] 1757 movu m5, [aq+mmsize*3-4] ; a[22-29] 1758 LOWPASS 5, 3, 4 ; A[23-30] 1759 SCRATCH 5, 8, rsp+0*mmsize 1760 pavgw m3, m4 1761 mova m4, [aq+mmsize*2+0] ; a[16-23] 1762 movu m6, [aq+mmsize*2-4] ; a[14-21] 1763 LOWPASS 6, 2, 4 ; A[15-22] 1764 SCRATCH 6, 9, rsp+1*mmsize 1765 pavgw m2, m4 1766 mova m4, [aq+mmsize*1+0] ; a[8-15] 1767 movu m7, [aq+mmsize*1-4] ; a[6-13] 1768 LOWPASS 7, 1, 4 ; A[7-14] 1769 SCRATCH 7, 10, rsp+2*mmsize 1770 pavgw m1, m4 1771 mova m4, [aq+mmsize*0+0] ; a[0-7] 1772 mova m5, [lq+mmsize*3+0] ; l[24-31] 1773 PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5] 1774 LOWPASS 6, 0, 4 ; #A[0-6] 1775 SCRATCH 6, 11, rsp+3*mmsize 1776 pavgw m4, m0 1777 PALIGNR m0, m5, 2, m7 ; l[25-31]* 1778 movu m7, [lq+mmsize*3-2] ; l[23-30] 1779 LOWPASS 0, 5, 7 ; L[24-31] 1780 movu m5, [lq+mmsize*2-2] ; l[15-22] 1781 mova m7, [lq+mmsize*2+0] ; l[16-23] 1782 movu m6, [lq+mmsize*2+2] ; l[17-24] 1783 LOWPASS 5, 7, 6 ; L[16-23] 1784 psrld m7, m0, 16 1785 psrld m6, m5, 16 1786 pand m0, [pd_65535] 1787 pand m5, [pd_65535] 1788 packssdw m6, m7 1789 packssdw m5, m0 1790 SCRATCH 5, 12, rsp+4*mmsize 1791 SCRATCH 6, 13, rsp+5*mmsize 1792 movu m6, [lq+mmsize*1-2] ; l[7-14] 1793 mova m0, [lq+mmsize*1+0] ; l[8-15] 1794 movu m5, [lq+mmsize*1+2] ; l[9-16] 1795 LOWPASS 6, 0, 5 ; L[8-15] 1796 movu m0, [lq+mmsize*0+2] ; l[1-8] 1797 pslldq m5, m0, 2 ; .l[1-7] 1798 pslldq m7, m0, 4 ; ..l[1-6] 1799 LOWPASS 0, 5, 7 1800 psrld m5, m6, 16 1801 psrld m7, m0, 16 1802 pand m6, [pd_65535] 1803 pand m0, [pd_65535] 1804 packssdw m7, m5 1805 packssdw m0, m6 1806 UNSCRATCH 6, 13, rsp+5*mmsize 1807 DEFINE_ARGS dst, stride, stride16, cnt, stride17 1808 mov stride16q, strideq 1809 mov cntd, 8 1810 shl stride16q, 4 1811%if ARCH_X86_64 1812 lea stride17q, [stride16q+strideq] 1813%endif 1814 1815.loop: 1816 mova [dstq+strideq*0+ 0], m4 1817 mova [dstq+strideq*0+16], m1 1818 mova [dstq+strideq*0+32], m2 1819 mova [dstq+strideq*0+48], m3 1820%if ARCH_X86_64 1821 mova [dstq+strideq*1+ 0], m11 1822 mova [dstq+strideq*1+16], m10 1823 mova [dstq+strideq*1+32], m9 1824 mova [dstq+strideq*1+48], m8 1825%endif 1826 mova [dstq+stride16q+ 0], m6 1827 mova [dstq+stride16q+16], m4 1828 mova [dstq+stride16q+32], m1 1829 mova [dstq+stride16q+48], m2 1830%if ARCH_X86_64 1831 mova [dstq+stride17q+ 0], m12 1832 mova [dstq+stride17q+16], m11 1833 mova [dstq+stride17q+32], m10 1834 mova [dstq+stride17q+48], m9 1835%endif 1836 lea dstq, [dstq+strideq*2] 1837 PALIGNR m3, m2, 14, m5 1838 PALIGNR m2, m1, 14, m5 1839 PALIGNR m1, m4, 14, m5 1840 PALIGNR m4, m6, 14, m5 1841 PALIGNR m6, m7, 14, m5 1842 pslldq m7, 2 1843%if ARCH_X86_64 1844 PALIGNR m8, m9, 14, m5 1845 PALIGNR m9, m10, 14, m5 1846 PALIGNR m10, m11, 14, m5 1847 PALIGNR m11, m12, 14, m5 1848 PALIGNR m12, m0, 14, m5 1849 pslldq m0, 2 1850%endif 1851 dec cntd 1852 jg .loop 1853 1854%if ARCH_X86_32 1855 UNSCRATCH 5, 12, rsp+4*mmsize 1856 UNSCRATCH 4, 11, rsp+3*mmsize 1857 UNSCRATCH 3, 10, rsp+2*mmsize 1858 UNSCRATCH 2, 9, rsp+1*mmsize 1859 UNSCRATCH 1, 8, rsp+0*mmsize 1860 mov dstq, dstm 1861 mov cntd, 8 1862 add dstq, strideq 1863.loop2: 1864 mova [dstq+strideq*0+ 0], m4 1865 mova [dstq+strideq*0+16], m3 1866 mova [dstq+strideq*0+32], m2 1867 mova [dstq+strideq*0+48], m1 1868 mova [dstq+stride16q+ 0], m5 1869 mova [dstq+stride16q+16], m4 1870 mova [dstq+stride16q+32], m3 1871 mova [dstq+stride16q+48], m2 1872 lea dstq, [dstq+strideq*2] 1873 PALIGNR m1, m2, 14, m6 1874 PALIGNR m2, m3, 14, m6 1875 PALIGNR m3, m4, 14, m6 1876 PALIGNR m4, m5, 14, m6 1877 PALIGNR m5, m0, 14, m6 1878 pslldq m0, 2 1879 dec cntd 1880 jg .loop2 1881%endif 1882 RET 1883%endmacro 1884 1885INIT_XMM sse2 1886VR_FUNCS 1887INIT_XMM ssse3 1888VR_FUNCS 1889INIT_XMM avx 1890VR_FUNCS 1891 1892%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function 1893cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a 1894 movh m0, [lq] ; abcd 1895%if cpuflag(ssse3) 1896 pshufb m0, [pb_0to7_67x4] ; abcddddd 1897%else 1898 punpcklqdq m0, m0 1899 pshufhw m0, m0, q3333 ; abcddddd 1900%endif 1901 psrldq m1, m0, 2 ; bcddddd. 1902 psrldq m2, m0, 4 ; cddddd.. 1903 LOWPASS 2, 1, 0 ; BCDddd.. 1904 pavgw m1, m0 ; abcddddd 1905 SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd 1906 PALIGNR m2, m1, 4, m0 ; bCcDdddd 1907 DEFINE_ARGS dst, stride, stride3 1908 lea stride3q, [strideq*3] 1909 1910 movh [dstq+strideq*0], m1 ; aBbC 1911 movh [dstq+strideq*1], m2 ; bCcD 1912 movhps [dstq+strideq*2], m1 ; cDdd 1913 movhps [dstq+stride3q ], m2 ; dddd 1914 RET 1915 1916cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a 1917 mova m0, [lq] 1918%if cpuflag(ssse3) 1919 mova m3, [pb_2to15_14_15] 1920%endif 1921 SHIFT_RIGHTx2 m1, m2, m0, m3 1922 LOWPASS 2, 1, 0 1923 pavgw m1, m0 1924 SBUTTERFLY wd, 1, 2, 0 1925 shufps m0, m1, m2, q1032 1926 pshufd m3, m2, q3332 1927 DEFINE_ARGS dst, stride, stride3 1928 lea stride3q, [strideq*3] 1929 1930 mova [dstq+strideq *0], m1 1931 mova [dstq+strideq *2], m0 1932 mova [dstq+strideq *4], m2 1933 mova [dstq+stride3q*2], m3 1934 add dstq, strideq 1935%if cpuflag(avx) 1936 vpalignr m1, m2, m1, 4 1937%else 1938 PALIGNR m0, m2, m1, 4, m3 1939 mova m1, m0 1940%endif 1941 pshufd m2, m2, q3321 1942 shufps m0, m1, m2, q1032 1943 pshufd m3, m2, q3332 1944 mova [dstq+strideq *0], m1 1945 mova [dstq+strideq *2], m0 1946 mova [dstq+strideq *4], m2 1947 mova [dstq+stride3q*2], m3 1948 RET 1949 1950cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a 1951 mova m0, [lq] 1952 mova m3, [lq+mmsize] 1953 movu m1, [lq+2] 1954 movu m2, [lq+4] 1955 LOWPASS 2, 1, 0 1956 pavgw m1, m0 1957 SBUTTERFLY wd, 1, 2, 0 1958%if cpuflag(ssse3) 1959 mova m5, [pb_2to15_14_15] 1960%endif 1961 SHIFT_RIGHTx2 m0, m4, m3, m5 1962 LOWPASS 4, 0, 3 1963 pavgw m3, m0 1964 SBUTTERFLY wd, 3, 4, 5 1965 pshufd m0, m0, q3333 1966 DEFINE_ARGS dst, stride, stride3, cnt 1967 lea stride3q, [strideq*3] 1968 mov cntd, 4 1969 1970.loop: 1971 mova [dstq+strideq *0+ 0], m1 1972 mova [dstq+strideq *0+16], m2 1973 mova [dstq+strideq *4+ 0], m2 1974 mova [dstq+strideq *4+16], m3 1975 mova [dstq+strideq *8+ 0], m3 1976 mova [dstq+strideq *8+16], m4 1977 mova [dstq+stride3q*4+ 0], m4 1978 mova [dstq+stride3q*4+16], m0 1979 add dstq, strideq 1980%if cpuflag(avx) 1981 vpalignr m1, m2, m1, 4 1982 vpalignr m2, m3, m2, 4 1983 vpalignr m3, m4, m3, 4 1984 vpalignr m4, m0, m4, 4 1985%else 1986 PALIGNR m5, m2, m1, 4, m6 1987 mova m1, m5 1988 PALIGNR m5, m3, m2, 4, m6 1989 mova m2, m5 1990 PALIGNR m5, m4, m3, 4, m6 1991 mova m3, m5 1992 PALIGNR m5, m0, m4, 4, m6 1993 mova m4, m5 1994%endif 1995 dec cntd 1996 jg .loop 1997 RET 1998 1999cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \ 2000 %1 * -mmsize * ARCH_X86_32, dst, stride, l, a 2001 mova m2, [lq+mmsize*0+0] 2002 movu m1, [lq+mmsize*0+2] 2003 movu m0, [lq+mmsize*0+4] 2004 LOWPASS 0, 1, 2 2005 pavgw m1, m2 2006 SBUTTERFLY wd, 1, 0, 2 2007 SCRATCH 1, 8, rsp+0*mmsize 2008 mova m4, [lq+mmsize*1+0] 2009 movu m3, [lq+mmsize*1+2] 2010 movu m2, [lq+mmsize*1+4] 2011 LOWPASS 2, 3, 4 2012 pavgw m3, m4 2013 SBUTTERFLY wd, 3, 2, 4 2014 mova m6, [lq+mmsize*2+0] 2015 movu m5, [lq+mmsize*2+2] 2016 movu m4, [lq+mmsize*2+4] 2017 LOWPASS 4, 5, 6 2018 pavgw m5, m6 2019 SBUTTERFLY wd, 5, 4, 6 2020 mova m7, [lq+mmsize*3+0] 2021 SCRATCH 0, 9, rsp+1*mmsize 2022%if cpuflag(ssse3) 2023 mova m0, [pb_2to15_14_15] 2024%endif 2025 SHIFT_RIGHTx2 m1, m6, m7, m0 2026 LOWPASS 6, 1, 7 2027 pavgw m7, m1 2028 SBUTTERFLY wd, 7, 6, 0 2029 pshufd m1, m1, q3333 2030 UNSCRATCH 0, 9, rsp+1*mmsize 2031 DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28 2032 lea stride3q, [strideq*3] 2033 lea stride4q, [strideq*4] 2034 lea stride28q, [stride4q*8] 2035 lea stride20q, [stride4q*5] 2036 sub stride28q, stride4q 2037 mov cntd, 4 2038 2039.loop: 2040%if ARCH_X86_64 2041 SWAP 1, 8 2042%else 2043 mova [rsp+1*mmsize], m1 2044 mova m1, [rsp+0*mmsize] 2045%endif 2046 mova [dstq+strideq *0+ 0], m1 2047 mova [dstq+strideq *0+16], m0 2048 mova [dstq+strideq *0+32], m3 2049 mova [dstq+strideq *0+48], m2 2050 mova [dstq+stride4q*1+ 0], m0 2051 mova [dstq+stride4q*1+16], m3 2052 mova [dstq+stride4q*1+32], m2 2053 mova [dstq+stride4q*1+48], m5 2054 mova [dstq+stride4q*2+ 0], m3 2055 mova [dstq+stride4q*2+16], m2 2056 mova [dstq+stride4q*2+32], m5 2057 mova [dstq+stride4q*2+48], m4 2058%if cpuflag(avx) 2059 vpalignr m1, m0, m1, 4 2060 vpalignr m0, m3, m0, 4 2061 vpalignr m3, m2, m3, 4 2062%else 2063 SCRATCH 6, 9, rsp+2*mmsize 2064%if notcpuflag(ssse3) 2065 SCRATCH 7, 10, rsp+3*mmsize 2066%endif 2067 PALIGNR m6, m0, m1, 4, m7 2068 mova m1, m6 2069 PALIGNR m6, m3, m0, 4, m7 2070 mova m0, m6 2071 PALIGNR m6, m2, m3, 4, m7 2072 mova m3, m6 2073 UNSCRATCH 6, 9, rsp+2*mmsize 2074 SCRATCH 0, 9, rsp+2*mmsize 2075%if notcpuflag(ssse3) 2076 UNSCRATCH 7, 10, rsp+3*mmsize 2077 SCRATCH 3, 10, rsp+3*mmsize 2078%endif 2079%endif 2080%if ARCH_X86_64 2081 SWAP 1, 8 2082%else 2083 mova [rsp+0*mmsize], m1 2084 mova m1, [rsp+1*mmsize] 2085%endif 2086 mova [dstq+stride3q*4+ 0], m2 2087 mova [dstq+stride3q*4+16], m5 2088 mova [dstq+stride3q*4+32], m4 2089 mova [dstq+stride3q*4+48], m7 2090 mova [dstq+stride4q*4+ 0], m5 2091 mova [dstq+stride4q*4+16], m4 2092 mova [dstq+stride4q*4+32], m7 2093 mova [dstq+stride4q*4+48], m6 2094 mova [dstq+stride20q + 0], m4 2095 mova [dstq+stride20q +16], m7 2096 mova [dstq+stride20q +32], m6 2097 mova [dstq+stride20q +48], m1 2098 mova [dstq+stride3q*8+ 0], m7 2099 mova [dstq+stride3q*8+16], m6 2100 mova [dstq+stride3q*8+32], m1 2101 mova [dstq+stride3q*8+48], m1 2102 mova [dstq+stride28q + 0], m6 2103 mova [dstq+stride28q +16], m1 2104 mova [dstq+stride28q +32], m1 2105 mova [dstq+stride28q +48], m1 2106%if cpuflag(avx) 2107 vpalignr m2, m5, m2, 4 2108 vpalignr m5, m4, m5, 4 2109 vpalignr m4, m7, m4, 4 2110 vpalignr m7, m6, m7, 4 2111 vpalignr m6, m1, m6, 4 2112%else 2113 PALIGNR m0, m5, m2, 4, m3 2114 mova m2, m0 2115 PALIGNR m0, m4, m5, 4, m3 2116 mova m5, m0 2117 PALIGNR m0, m7, m4, 4, m3 2118 mova m4, m0 2119 PALIGNR m0, m6, m7, 4, m3 2120 mova m7, m0 2121 PALIGNR m0, m1, m6, 4, m3 2122 mova m6, m0 2123 UNSCRATCH 0, 9, rsp+2*mmsize 2124%if notcpuflag(ssse3) 2125 UNSCRATCH 3, 10, rsp+3*mmsize 2126%endif 2127%endif 2128 add dstq, strideq 2129 dec cntd 2130 jg .loop 2131 RET 2132%endmacro 2133 2134INIT_XMM sse2 2135HU_FUNCS 4 2136INIT_XMM ssse3 2137HU_FUNCS 3 2138INIT_XMM avx 2139HU_FUNCS 2 2140 2141%macro HD_FUNCS 0 2142cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a 2143 movh m0, [lq] 2144 movhps m0, [aq-2] 2145 psrldq m1, m0, 2 2146 psrldq m2, m0, 4 2147 LOWPASS 2, 1, 0 2148 pavgw m1, m0 2149 punpcklwd m1, m2 2150 DEFINE_ARGS dst, stride, stride3 2151 lea stride3q, [strideq*3] 2152 2153 movh [dstq+stride3q ], m1 2154 movhps [dstq+strideq*1], m1 2155 movhlps m2, m2 2156 PALIGNR m2, m1, 4, m0 2157 movh [dstq+strideq*2], m2 2158 movhps [dstq+strideq*0], m2 2159 RET 2160 2161cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a 2162 mova m0, [lq] 2163 movu m1, [aq-2] 2164 PALIGNR m2, m1, m0, 2, m3 2165 PALIGNR m3, m1, m0, 4, m4 2166 LOWPASS 3, 2, 0 2167 pavgw m2, m0 2168 SBUTTERFLY wd, 2, 3, 0 2169 psrldq m0, m1, 2 2170 psrldq m4, m1, 4 2171 LOWPASS 1, 0, 4 2172 DEFINE_ARGS dst8, mstride, cnt 2173 lea dst8q, [dst8q+mstrideq*8] 2174 neg mstrideq 2175 mov cntd, 4 2176 2177.loop: 2178 add dst8q, mstrideq 2179 mova [dst8q+mstrideq*0], m2 2180 mova [dst8q+mstrideq*4], m3 2181%if cpuflag(avx) 2182 vpalignr m2, m3, m2, 4 2183 vpalignr m3, m1, m3, 4 2184%else 2185 PALIGNR m0, m3, m2, 4, m4 2186 mova m2, m0 2187 PALIGNR m0, m1, m3, 4, m4 2188 mova m3, m0 2189%endif 2190 psrldq m1, 4 2191 dec cntd 2192 jg .loop 2193 RET 2194 2195cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a 2196 mova m2, [lq] 2197 movu m1, [lq+2] 2198 movu m0, [lq+4] 2199 LOWPASS 0, 1, 2 2200 pavgw m1, m2 2201 mova m4, [lq+mmsize] 2202 movu m5, [aq-2] 2203 PALIGNR m3, m5, m4, 2, m6 2204 PALIGNR m2, m5, m4, 4, m6 2205 LOWPASS 2, 3, 4 2206 pavgw m3, m4 2207 SBUTTERFLY wd, 1, 0, 4 2208 SBUTTERFLY wd, 3, 2, 4 2209 mova m6, [aq] 2210 movu m4, [aq+2] 2211 LOWPASS 4, 6, 5 2212 movu m5, [aq+mmsize-2] 2213 psrldq m6, m5, 2 2214 psrldq m7, m5, 4 2215 LOWPASS 5, 6, 7 2216 DEFINE_ARGS dst, mstride, mstride3, cnt 2217 lea dstq, [dstq+mstrideq*8] 2218 lea dstq, [dstq+mstrideq*8] 2219 neg mstrideq 2220 lea mstride3q, [mstrideq*3] 2221 mov cntd, 4 2222 2223.loop: 2224 add dstq, mstrideq 2225 mova [dstq+mstride3q*4+ 0], m2 2226 mova [dstq+mstride3q*4+16], m4 2227 mova [dstq+mstrideq *8+ 0], m3 2228 mova [dstq+mstrideq *8+16], m2 2229 mova [dstq+mstrideq *4+ 0], m0 2230 mova [dstq+mstrideq *4+16], m3 2231 mova [dstq+mstrideq *0+ 0], m1 2232 mova [dstq+mstrideq *0+16], m0 2233%if cpuflag(avx) 2234 vpalignr m1, m0, m1, 4 2235 vpalignr m0, m3, m0, 4 2236 vpalignr m3, m2, m3, 4 2237 vpalignr m2, m4, m2, 4 2238 vpalignr m4, m5, m4, 4 2239%else 2240 PALIGNR m6, m0, m1, 4, m7 2241 mova m1, m6 2242 PALIGNR m6, m3, m0, 4, m7 2243 mova m0, m6 2244 PALIGNR m6, m2, m3, 4, m7 2245 mova m3, m6 2246 PALIGNR m6, m4, m2, 4, m7 2247 mova m2, m6 2248 PALIGNR m6, m5, m4, 4, m7 2249 mova m4, m6 2250%endif 2251 psrldq m5, 4 2252 dec cntd 2253 jg .loop 2254 RET 2255 2256cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \ 2257 10 * -mmsize * ARCH_X86_32, dst, stride, l, a 2258 mova m2, [lq+mmsize*0+0] 2259 movu m1, [lq+mmsize*0+2] 2260 movu m0, [lq+mmsize*0+4] 2261 LOWPASS 0, 1, 2 2262 pavgw m1, m2 2263 SBUTTERFLY wd, 1, 0, 2 2264 mova m4, [lq+mmsize*1+0] 2265 movu m3, [lq+mmsize*1+2] 2266 movu m2, [lq+mmsize*1+4] 2267 LOWPASS 2, 3, 4 2268 pavgw m3, m4 2269 SBUTTERFLY wd, 3, 2, 4 2270 SCRATCH 0, 8, rsp+0*mmsize 2271 SCRATCH 1, 9, rsp+1*mmsize 2272 SCRATCH 2, 10, rsp+2*mmsize 2273 SCRATCH 3, 11, rsp+3*mmsize 2274 mova m6, [lq+mmsize*2+0] 2275 movu m5, [lq+mmsize*2+2] 2276 movu m4, [lq+mmsize*2+4] 2277 LOWPASS 4, 5, 6 2278 pavgw m5, m6 2279 SBUTTERFLY wd, 5, 4, 6 2280 mova m0, [lq+mmsize*3+0] 2281 movu m1, [aq+mmsize*0-2] 2282 PALIGNR m7, m1, m0, 2, m2 2283 PALIGNR m6, m1, m0, 4, m2 2284 LOWPASS 6, 7, 0 2285 pavgw m7, m0 2286 SBUTTERFLY wd, 7, 6, 0 2287 mova m2, [aq+mmsize*0+0] 2288 movu m0, [aq+mmsize*0+2] 2289 LOWPASS 0, 2, 1 2290 movu m1, [aq+mmsize*1-2] 2291 mova m2, [aq+mmsize*1+0] 2292 movu m3, [aq+mmsize*1+2] 2293 LOWPASS 1, 2, 3 2294 SCRATCH 6, 12, rsp+6*mmsize 2295 SCRATCH 7, 13, rsp+7*mmsize 2296 movu m2, [aq+mmsize*2-2] 2297 mova m3, [aq+mmsize*2+0] 2298 movu m6, [aq+mmsize*2+2] 2299 LOWPASS 2, 3, 6 2300 movu m3, [aq+mmsize*3-2] 2301 psrldq m6, m3, 2 2302 psrldq m7, m3, 4 2303 LOWPASS 3, 6, 7 2304 UNSCRATCH 6, 12, rsp+6*mmsize 2305 UNSCRATCH 7, 13, rsp+7*mmsize 2306%if ARCH_X86_32 2307 mova [rsp+4*mmsize], m4 2308 mova [rsp+5*mmsize], m5 2309 ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need 2310 ; to do it again here 2311%endif 2312 DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28 2313 mov cntd, 4 2314 lea stride3q, [strideq*3] 2315%if ARCH_X86_64 2316 lea stride4q, [strideq*4] 2317 lea stride28q, [stride4q*8] 2318 lea stride20q, [stride4q*5] 2319 sub stride28q, stride4q 2320%endif 2321 add dstq, stride3q 2322 2323 ; x86-32 doesn't have enough registers, so on that platform, we split 2324 ; the loop in 2... Otherwise you spend most of the loop (un)scratching 2325.loop: 2326%if ARCH_X86_64 2327 mova [dstq+stride28q + 0], m9 2328 mova [dstq+stride28q +16], m8 2329 mova [dstq+stride28q +32], m11 2330 mova [dstq+stride28q +48], m10 2331 mova [dstq+stride3q*8+ 0], m8 2332 mova [dstq+stride3q*8+16], m11 2333 mova [dstq+stride3q*8+32], m10 2334 mova [dstq+stride3q*8+48], m5 2335 mova [dstq+stride20q + 0], m11 2336 mova [dstq+stride20q +16], m10 2337 mova [dstq+stride20q +32], m5 2338 mova [dstq+stride20q +48], m4 2339 mova [dstq+stride4q*4+ 0], m10 2340 mova [dstq+stride4q*4+16], m5 2341 mova [dstq+stride4q*4+32], m4 2342 mova [dstq+stride4q*4+48], m7 2343%endif 2344 mova [dstq+stride3q*4+ 0], m5 2345 mova [dstq+stride3q*4+16], m4 2346 mova [dstq+stride3q*4+32], m7 2347 mova [dstq+stride3q*4+48], m6 2348 mova [dstq+strideq* 8+ 0], m4 2349 mova [dstq+strideq* 8+16], m7 2350 mova [dstq+strideq* 8+32], m6 2351 mova [dstq+strideq* 8+48], m0 2352 mova [dstq+strideq* 4+ 0], m7 2353 mova [dstq+strideq* 4+16], m6 2354 mova [dstq+strideq* 4+32], m0 2355 mova [dstq+strideq* 4+48], m1 2356 mova [dstq+strideq* 0+ 0], m6 2357 mova [dstq+strideq* 0+16], m0 2358 mova [dstq+strideq* 0+32], m1 2359 mova [dstq+strideq* 0+48], m2 2360 sub dstq, strideq 2361%if cpuflag(avx) 2362%if ARCH_X86_64 2363 vpalignr m9, m8, m9, 4 2364 vpalignr m8, m11, m8, 4 2365 vpalignr m11, m10, m11, 4 2366 vpalignr m10, m5, m10, 4 2367%endif 2368 vpalignr m5, m4, m5, 4 2369 vpalignr m4, m7, m4, 4 2370 vpalignr m7, m6, m7, 4 2371 vpalignr m6, m0, m6, 4 2372 vpalignr m0, m1, m0, 4 2373 vpalignr m1, m2, m1, 4 2374 vpalignr m2, m3, m2, 4 2375%else 2376%if ARCH_X86_64 2377 PALIGNR m12, m8, m9, 4, m13 2378 mova m9, m12 2379 PALIGNR m12, m11, m8, 4, m13 2380 mova m8, m12 2381 PALIGNR m12, m10, m11, 4, m13 2382 mova m11, m12 2383 PALIGNR m12, m5, m10, 4, m13 2384 mova m10, m12 2385%endif 2386 SCRATCH 3, 12, rsp+8*mmsize, sh 2387%if notcpuflag(ssse3) 2388 SCRATCH 2, 13, rsp+9*mmsize 2389%endif 2390 PALIGNR m3, m4, m5, 4, m2 2391 mova m5, m3 2392 PALIGNR m3, m7, m4, 4, m2 2393 mova m4, m3 2394 PALIGNR m3, m6, m7, 4, m2 2395 mova m7, m3 2396 PALIGNR m3, m0, m6, 4, m2 2397 mova m6, m3 2398 PALIGNR m3, m1, m0, 4, m2 2399 mova m0, m3 2400%if notcpuflag(ssse3) 2401 UNSCRATCH 2, 13, rsp+9*mmsize 2402 SCRATCH 0, 13, rsp+9*mmsize 2403%endif 2404 PALIGNR m3, m2, m1, 4, m0 2405 mova m1, m3 2406 PALIGNR m3, reg_sh, m2, 4, m0 2407 mova m2, m3 2408%if notcpuflag(ssse3) 2409 UNSCRATCH 0, 13, rsp+9*mmsize 2410%endif 2411 UNSCRATCH 3, 12, rsp+8*mmsize, sh 2412%endif 2413 psrldq m3, 4 2414 dec cntd 2415 jg .loop 2416 2417%if ARCH_X86_32 2418 UNSCRATCH 0, 8, rsp+0*mmsize 2419 UNSCRATCH 1, 9, rsp+1*mmsize 2420 UNSCRATCH 2, 10, rsp+2*mmsize 2421 UNSCRATCH 3, 11, rsp+3*mmsize 2422 mova m4, [rsp+4*mmsize] 2423 mova m5, [rsp+5*mmsize] 2424 mova m6, [rsp+6*mmsize] 2425 mova m7, [rsp+7*mmsize] 2426 DEFINE_ARGS dst, stride, stride5, stride3 2427 lea stride5q, [strideq*5] 2428 lea dstq, [dstq+stride5q*4] 2429 DEFINE_ARGS dst, stride, cnt, stride3 2430 mov cntd, 4 2431.loop_2: 2432 mova [dstq+stride3q*4+ 0], m1 2433 mova [dstq+stride3q*4+16], m0 2434 mova [dstq+stride3q*4+32], m3 2435 mova [dstq+stride3q*4+48], m2 2436 mova [dstq+strideq* 8+ 0], m0 2437 mova [dstq+strideq* 8+16], m3 2438 mova [dstq+strideq* 8+32], m2 2439 mova [dstq+strideq* 8+48], m5 2440 mova [dstq+strideq* 4+ 0], m3 2441 mova [dstq+strideq* 4+16], m2 2442 mova [dstq+strideq* 4+32], m5 2443 mova [dstq+strideq* 4+48], m4 2444 mova [dstq+strideq* 0+ 0], m2 2445 mova [dstq+strideq* 0+16], m5 2446 mova [dstq+strideq* 0+32], m4 2447 mova [dstq+strideq* 0+48], m7 2448 sub dstq, strideq 2449%if cpuflag(avx) 2450 vpalignr m1, m0, m1, 4 2451 vpalignr m0, m3, m0, 4 2452 vpalignr m3, m2, m3, 4 2453 vpalignr m2, m5, m2, 4 2454 vpalignr m5, m4, m5, 4 2455 vpalignr m4, m7, m4, 4 2456 vpalignr m7, m6, m7, 4 2457%else 2458 SCRATCH 6, 12, rsp+8*mmsize, sh 2459%if notcpuflag(ssse3) 2460 SCRATCH 7, 13, rsp+9*mmsize 2461%endif 2462 PALIGNR m6, m0, m1, 4, m7 2463 mova m1, m6 2464 PALIGNR m6, m3, m0, 4, m7 2465 mova m0, m6 2466 PALIGNR m6, m2, m3, 4, m7 2467 mova m3, m6 2468 PALIGNR m6, m5, m2, 4, m7 2469 mova m2, m6 2470 PALIGNR m6, m4, m5, 4, m7 2471 mova m5, m6 2472%if notcpuflag(ssse3) 2473 UNSCRATCH 7, 13, rsp+9*mmsize 2474 SCRATCH 5, 13, rsp+9*mmsize 2475%endif 2476 PALIGNR m6, m7, m4, 4, m5 2477 mova m4, m6 2478 PALIGNR m6, reg_sh, m7, 4, m5 2479 mova m7, m6 2480%if notcpuflag(ssse3) 2481 UNSCRATCH 5, 13, rsp+9*mmsize 2482%endif 2483 UNSCRATCH 6, 12, rsp+8*mmsize, sh 2484%endif 2485 psrldq m6, 4 2486 dec cntd 2487 jg .loop_2 2488%endif 2489 RET 2490%endmacro 2491 2492INIT_XMM sse2 2493HD_FUNCS 2494INIT_XMM ssse3 2495HD_FUNCS 2496INIT_XMM avx 2497HD_FUNCS 2498