Lines matching refs:T1

Every line below references the $T1 SIMD temporary in the Poly1305 x86 perlasm; the leading number on each line is its line number in the source file, so gaps between matches are lines that do not touch $T1.

392 my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
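For orientation: $D0..$D4 carry the 130-bit Poly1305 accumulator as five 26-bit limbs (one limb per register, replicated across SIMD lanes), while $T0..$T2 are scratch. A minimal scalar C sketch of that representation, with hypothetical names:

    #include <stdint.h>

    /* h = h[0] + h[1]*2^26 + h[2]*2^52 + h[3]*2^78 + h[4]*2^104;
       each limb stays near 26 bits, with slack absorbed by lazy reduction. */
    typedef struct { uint32_t h[5]; } poly1305_acc26;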
430 &movdqa ($T1,$D1);
432 &pslld ($T1,2);
434 &paddd ($T1,$D1); # *5
436 &movdqa (&QWP(16*5,"esp"),$T1);
438 &movdqa ($T1,$D3);
440 &pslld ($T1,2);
442 &paddd ($T1,$D3); # *5
444 &movdqa (&QWP(16*7,"esp"),$T1);
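The pslld-by-2 plus paddd pairs above precompute s1 = 5*r1 and s3 = 5*r3 (stashed at esp+16*5 and esp+16*7). The factor 5 lets products that wrap past limb 4 fold back in, since 2^130 is congruent to 5 mod 2^130 - 5. A minimal sketch of the identity the two instructions implement:

    #include <stdint.h>

    /* s_i = 5*r_i computed as (r_i << 2) + r_i, mirroring pslld/paddd. */
    static inline uint32_t times5(uint32_t r) { return (r << 2) + r; }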
447 &pshufd ($T1,$D0,0b01000100);
453 &movdqa (&QWP(16*0,"edx"),$T1);
470 &pmuludq ($D0,$T1); # h0*r0
482 &movdqa ($T1,$T0);
484 &movdqa ($T2,$T1);
485 &pmuludq ($T1,&QWP(16*2,$base)); # r1*h2
489 &paddq ($D3,$T1);
490 &$load ($T1,5); # s1
493 &pmuludq ($T1,&QWP(16*4,$base)); # s1*h4
499 &paddq ($D0,$T1);
500 &movdqa ($T1,$T0);
504 &pmuludq ($T1,&QWP(16*0,$base)); # r2*h0
508 &paddq ($D2,$T1);
510 &$load ($T1,3); # r3^n
513 &movdqa ($T2,$T1);
514 &pmuludq ($T1,&QWP(16*1,$base)); # r3*h1
518 &paddq ($D4,$T1);
519 &movdqa ($T1,$T0);
522 &movdqa ($T2,$T1);
523 &pmuludq ($T1,&QWP(16*3,$base)); # s3*h3
527 &paddq ($D1,$T1);
529 &$load ($T1,8); # s4^n
532 &movdqa ($T2,$T1);
533 &pmuludq ($T1,&QWP(16*4,$base)); # s4*h4
537 &paddq ($D3,$T1);
538 &movdqa ($T1,$T0);
541 &pmuludq ($T1,&QWP(16*3,$base)); # s4*h3
544 &paddq ($D2,$T1);
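Taken together, the pmuludq/paddq chains above form the usual 5x5 schoolbook product in radix 2^26, with s_k = 5*r_k standing in for r_k whenever the limb indices wrap past 4 (matching the r1*h2 -> d3, s1*h4 -> d0 comments). A scalar C sketch of the full accumulator set, donna-style; names are hypothetical, this is the equivalent arithmetic rather than the file's code:

    #include <stdint.h>

    /* One h *= r step: d0..d4 are the 64-bit per-lane accumulators that
       the pmuludq/paddq chains build up. */
    static void mul_h_r(uint64_t d[5], const uint32_t h[5], const uint32_t r[5])
    {
        uint32_t s1 = 5*r[1], s2 = 5*r[2], s3 = 5*r[3], s4 = 5*r[4];

        d[0] = (uint64_t)h[0]*r[0] + (uint64_t)h[1]*s4 + (uint64_t)h[2]*s3
             + (uint64_t)h[3]*s2 + (uint64_t)h[4]*s1;
        d[1] = (uint64_t)h[0]*r[1] + (uint64_t)h[1]*r[0] + (uint64_t)h[2]*s4
             + (uint64_t)h[3]*s3 + (uint64_t)h[4]*s2;
        d[2] = (uint64_t)h[0]*r[2] + (uint64_t)h[1]*r[1] + (uint64_t)h[2]*r[0]
             + (uint64_t)h[3]*s4 + (uint64_t)h[4]*s3;
        d[3] = (uint64_t)h[0]*r[3] + (uint64_t)h[1]*r[2] + (uint64_t)h[2]*r[1]
             + (uint64_t)h[3]*r[0] + (uint64_t)h[4]*s4;
        d[4] = (uint64_t)h[0]*r[4] + (uint64_t)h[1]*r[3] + (uint64_t)h[2]*r[2]
             + (uint64_t)h[3]*r[1] + (uint64_t)h[4]*r[0];
    }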
564 &movdqa ($T1,$D0);
566 &psrlq ($T1,26);
568 &paddq ($T1,$D1); # h0 -> h1
571 &movdqa ($D1,$T1);
572 &psrlq ($T1,26);
578 &paddq ($T1,$D2); # h1 -> h2
581 &movdqa ($D2,$T1);
582 &psrlq ($T1,26);
584 &paddd ($T1,$D3); # h2 -> h3
587 &movdqa ($D3,$T1);
588 &psrlq ($T1,26);
592 &paddd ($D4,$T1); # h3 -> h4
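The psrlq-by-26/padd chain above is the lazy carry pass: each limb keeps its low 26 bits and passes the rest up one limb (h0 -> h1 -> ... -> h4). A scalar sketch; the final wrap of the limb-4 carry back into limb 0, times 5, belongs to the same reduction but is inferred here, as it runs on temporaries other than $T1 and so does not appear in these matches:

    #include <stdint.h>

    /* Carry pass: keep 26 bits per limb, push the overflow upward.
       Kept lazy, so limbs may hold a little slack afterwards. */
    static void carry_chain(uint64_t d[5], uint32_t h[5])
    {
        const uint32_t mask = (1u << 26) - 1;
        d[1] += d[0] >> 26;  h[0] = (uint32_t)d[0] & mask;   /* h0 -> h1 */
        d[2] += d[1] >> 26;  h[1] = (uint32_t)d[1] & mask;   /* h1 -> h2 */
        d[3] += d[2] >> 26;  h[2] = (uint32_t)d[2] & mask;   /* h2 -> h3 */
        d[4] += d[3] >> 26;  h[3] = (uint32_t)d[3] & mask;   /* h3 -> h4 */
        h[4] = (uint32_t)d[4] & mask;
        h[0] += (uint32_t)(d[4] >> 26) * 5;  /* wrap: 2^130 = 5 (mod p) */
    }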
630 &movdqa ($T1,$D1);
632 &pslld ($T1,2);
634 &paddd ($T1,$D1); # *5
636 &movdqu (&QWP(16*5,"edi"),$T1);
638 &movdqa ($T1,$D3);
640 &pslld ($T1,2);
642 &paddd ($T1,$D3); # *5
644 &movdqu (&QWP(16*7,"edi"),$T1);
727 &movdqu ($T1,&QWP(0,"esi")); # input
730 &movdqa ($T0,$T1); # -> base 2^26 ...
731 &pand ($T1,$MASK);
732 &paddd ($D0,$T1); # ... and accumulate
734 &movdqa ($T1,$T0);
736 &psrldq ($T1,6);
740 &movdqa ($T0,$T1);
741 &psrlq ($T1,4);
742 &pand ($T1,$MASK);
743 &paddd ($D2,$T1);
745 &movdqa ($T1,$T0);
748 &psrldq ($T1,7);
752 &paddd ($D4,$T1);
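The movdqu/pand/psrldq/psrlq run above converts one 16-byte input block to base 2^26 and adds it into the accumulator: pand with the 26-bit $MASK extracts a limb, the shifts reposition the next one. A scalar equivalent, assuming little-endian loads as on x86 (hibit is the 2^128 padding bit):

    #include <stdint.h>
    #include <string.h>

    static void block_to_base26(uint32_t t[5], const uint8_t m[16], uint32_t hibit)
    {
        const uint32_t mask = (1u << 26) - 1;
        uint64_t lo, hi;
        memcpy(&lo, m, 8);
        memcpy(&hi, m + 8, 8);
        t[0] = (uint32_t)lo & mask;                         /* bits   0..25  */
        t[1] = (uint32_t)(lo >> 26) & mask;                 /* bits  26..51  */
        t[2] = (uint32_t)((lo >> 52) | (hi << 12)) & mask;  /* bits  52..77  */
        t[3] = (uint32_t)(hi >> 14) & mask;                 /* bits  78..103 */
        t[4] = (uint32_t)(hi >> 40) | (hibit << 24);        /* bits 104..129 */
    }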
753 &movd ($T1,&DWP(16*0+12,"edi")); # r0
769 &pmuludq ($D0,$T1); # h4*r0
770 &pmuludq ($D1,$T1); # h3*r0
771 &pmuludq ($D2,$T1); # h2*r0
773 &pmuludq ($D3,$T1); # h1*r0
774 &pmuludq ($D4,$T1); # h0*r0
794 &pshufd ($T1,$T0,0b01000100); # duplicate r^3:r^4
797 &movdqa (&QWP(16*0,"edx"),$T1);
799 &movdqu ($T1,&QWP(16*1,"edi"));
801 &pshufd ($T0,$T1,0b01000100);
802 &pshufd ($T1,$T1,0b11101110);
805 &movdqa (&QWP(16*(1-9),"edx"),$T1);
806 &pshufd ($T1,$T0,0b01000100);
808 &movdqa (&QWP(16*2,"edx"),$T1);
809 &movdqu ($T1,&QWP(16*3,"edi"));
811 &pshufd ($T0,$T1,0b01000100);
812 &pshufd ($T1,$T1,0b11101110);
815 &movdqa (&QWP(16*(3-9),"edx"),$T1);
816 &pshufd ($T1,$T0,0b01000100);
818 &movdqa (&QWP(16*4,"edx"),$T1);
819 &movdqu ($T1,&QWP(16*5,"edi"));
821 &pshufd ($T0,$T1,0b01000100);
822 &pshufd ($T1,$T1,0b11101110);
825 &movdqa (&QWP(16*(5-9),"edx"),$T1);
826 &pshufd ($T1,$T0,0b01000100);
828 &movdqa (&QWP(16*6,"edx"),$T1);
829 &movdqu ($T1,&QWP(16*7,"edi"));
831 &pshufd ($T0,$T1,0b01000100);
832 &pshufd ($T1,$T1,0b11101110);
835 &movdqa (&QWP(16*(7-9),"edx"),$T1);
836 &pshufd ($T1,$T0,0b01000100);
838 &movdqa (&QWP(16*8,"edx"),$T1);
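The pshufd immediates above do the lane replication for the r^n table: 0b01000100 copies the low 64 bits into both halves and 0b11101110 copies the high 64 bits, which is how the r^1:r^2 and r^3:r^4 pairs get broadcast (the negative 16*(n-9) offsets store the second set below where edx points). The two shuffles as SSE2 intrinsics:

    #include <emmintrin.h>

    /* pshufd 0b01000100 = _MM_SHUFFLE(1,0,1,0): duplicate the low qword;
       pshufd 0b11101110 = _MM_SHUFFLE(3,2,3,2): duplicate the high qword. */
    static inline __m128i dup_lo64(__m128i v)
    {   return _mm_shuffle_epi32(v, _MM_SHUFFLE(1,0,1,0)); }
    static inline __m128i dup_hi64(__m128i v)
    {   return _mm_shuffle_epi32(v, _MM_SHUFFLE(3,2,3,2)); }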
845 &movdqu ($T1,&QWP($inpbase+16,"esi"));
853 &movdqa ($D3,$T1);
858 &punpckhqdq ($D4,$T1); # 4
859 &punpcklqdq ($T0,$T1); # 0:1
864 &movdqa ($T1,$T0);
866 &psrlq ($T1,26);
868 &pand ($T1,$MASK); # 1
892 &movdqa (&QWP(16*1,"eax"),$T1);
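Above, a second input block is loaded and interleaved with the first: punpcklqdq/punpckhqdq regroup the data so each xmm register holds the same 64-bit half of both blocks, i.e. the vectorization runs across blocks rather than within one. A sketch with SSE2 intrinsics, names hypothetical:

    #include <emmintrin.h>

    /* lo = (blk0.lo64, blk1.lo64), hi = (blk0.hi64, blk1.hi64), so the
       following pand/psrlq produce each 26-bit limb for both blocks at once. */
    static void pair_blocks(__m128i blk0, __m128i blk1, __m128i *lo, __m128i *hi)
    {
        *lo = _mm_unpacklo_epi64(blk0, blk1);
        *hi = _mm_unpackhi_epi64(blk0, blk1);
    }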
906 &movdqa ($D0,$T1);
907 &pmuludq ($T1,$T2); # h1*r0
921 &paddq ($D1,$T1);
922 &movdqa ($T1,$T0);
926 &pmuludq ($T1,&$addr(4)); # h0*r4
931 &paddq ($D4,$T1);
932 &movdqa ($T1,$T0);
936 &pmuludq ($T1,&$addr(3)); # h1*r3
940 &paddq ($D4,$T1);
941 &movdqa ($T1,$T0);
945 &movdqa ($T2,$T1);
946 &pmuludq ($T1,&$addr(1)); # h2*r1
950 &paddq ($D3,$T1);
951 &movdqa ($T1,$T0);
954 &movdqa ($T2,$T1);
955 &pmuludq ($T1,&$addr(7)); # h3*s3
959 &paddq ($D1,$T1);
961 &movdqa ($T1,&QWP(16*4,"eax")); # pull h4
964 &movdqa ($T2,$T1);
965 &pmuludq ($T1,&$addr(8)); # h4*s4
969 &paddq ($D3,$T1);
970 &movdqa ($T1,$T0);
974 &pmuludq ($T1,&$addr(7)); # h4*s3
976 &paddq ($D2,$T1);
985 &paddd ($T1,&QWP(16*(5+1),"esp"));
995 &movdqa (&QWP(16*1,"eax"),$T1);
1010 &movdqa ($D0,$T1);
1011 &pmuludq ($T1,$T2); # h1*r0
1016 &paddq ($T1,&QWP(16*1,"esp"));
1038 &paddd ($T1,$D1);
1046 &movdqa (&QWP(16*1,"eax"),$T1);
1059 &pmuludq ($T1,$T2); # h1*r0
1064 &movdqa ($D1,$T1);
1077 &paddd ($T1,&QWP(16*6,"esp"));
1087 &movdqa (&QWP(16*1,"esp"),$T1);
1088 &pmuludq ($T1,$T2); # h1*r0
1092 &paddq ($D1,$T1);
1093 &movdqa ($T1,$D3);
1099 &movdqa (&QWP(16*3,"esp"),$T1);
1100 &movdqa ($T1,$D4);
1103 &movdqa (&QWP(16*4,"esp"),$T1);
1114 &pshufd ($T1,$D4,0b01001110);
1116 &paddq ($D4,$T1);
1118 &pshufd ($T1,$D0,0b01001110);
1120 &paddq ($D0,$T1);
1122 &pshufd ($T1,$D2,0b01001110);
1123 #&paddq ($D2,$T1);
1125 &$lazy_reduction (sub { &paddq ($D2,$T1) });
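Here the two block lanes are summed: pshufd with 0b01001110 swaps the two 64-bit halves and paddq adds each lane to its neighbour. Note the commented-out paddq for $D2 just above is not dead code; it is handed to the lazy-reduction routine as a closure so the add can be interleaved with the carry chain. The fold itself as an SSE2 intrinsic:

    #include <emmintrin.h>

    /* pshufd 0b01001110 = _MM_SHUFFLE(1,0,3,2) swaps the 64-bit halves,
       so adding the shuffle to the original sums both lanes into each half. */
    static inline __m128i fold_lanes(__m128i v)
    {
        return _mm_add_epi64(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1,0,3,2)));
    }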
1266 &vpslld ($T1,$D1,2);
1268 &vpaddd ($T1,$T1,$D1); # *5
1270 &vmovdqa (&QWP(16*5,"esp"),$T1);
1272 &vpslld ($T1,$D3,2);
1274 &vpaddd ($T1,$T1,$D3); # *5
1276 &vmovdqa (&QWP(16*7,"esp"),$T1);
1280 &vmovdqa ($T1,$D1);
1304 &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # r1*h3
1306 &vpmuludq ($T2,$T1,&QWP(16*2,"edx")); # r1*h2
1308 &vpmuludq ($T0,$T1,&QWP(16*1,"edx")); # r1*h1
1311 &vpmuludq ($T1,$T1,&QWP(16*0,"edx")); # r1*h0
1312 &vpaddq ($D1,$D1,$T1);
1317 &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # r2*h2
1318 &vpaddq ($D4,$D4,$T1);
1321 &vmovdqa ($T1,&QWP(16*6,"esp")); # s2
1324 &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s2*h4
1327 &vpmuludq ($T1,$T1,&QWP(16*3,"edx")); # s2*h3
1328 &vpaddq ($D0,$D0,$T1);
1332 &vmovdqa ($T1,&QWP(16*7,"esp")); # s3
1335 &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s3*h4
1337 &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # s3*h3
1340 &vpmuludq ($T1,$T1,&QWP(16*2,"edx")); # s3*h2
1341 &vpaddq ($D0,$D0,$T1);
1346 &vpmuludq ($T1,$T0,&QWP(16*4,"edx")); # s4*h4
1347 &vpaddq ($D3,$D3,$T1);
1350 &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # s4*h2
1351 &vpaddq ($D1,$D1,$T1);
1360 &vpsrlq ($T1,$D0,26);
1363 &vpaddq ($D1,$D1,$T1); # h0 -> h1
1366 &vpsrlq ($T1,$D1,26);
1368 &vpaddq ($D2,$D2,$T1); # h1 -> h2
1371 &vpsrlq ($T1,$D2,26);
1374 &vpaddd ($D3,$D3,$T1); # h2 -> h3
1375 &vpsrlq ($T1,$D3,26);
1380 &vpaddd ($D4,$D4,$T1); # h3 -> h4
1416 &vpslld ($T1,$D1,2);
1418 &vpaddd ($T1,$T1,$D1); # *5
1420 &vmovdqu (&QWP(16*5,"edi"),$T1);
1422 &vpslld ($T1,$D3,2);
1424 &vpaddd ($T1,$T1,$D3); # *5
1426 &vmovdqu (&QWP(16*7,"edi"),$T1);
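The AVX block above repeats the same *5 and carry arithmetic in non-destructive three-operand form, which drops the movdqa copies of the SSE2 path; it also keeps one r/s limb resident and streams the h limbs past it (r1*h3, r1*h2, r1*h1, r1*h0 above). A scalar sketch of that multiplier-stationary order, names hypothetical:

    #include <stdint.h>

    /* One loaded limb r1 hits four h limbs in a row: r1*h_j -> d_{j+1}. */
    static void mul_limb_r1(uint64_t d[5], const uint32_t h[4], uint32_t r1)
    {
        for (int j = 0; j < 4; j++)
            d[j + 1] += (uint64_t)r1 * h[j];
    }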
1437 my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7));
1565 &vmovdqu (&X($T1),&QWP(16*1,"esi"));
1582 &vpxor ($T1,$T1,$T1);
1592 &vmovdqu (&X($T1),&QWP(16*1,"esi"));
1594 &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1);
1612 &vpsrldq ($D0,$T1,6);
1614 &vpunpckhqdq ($D1,$T0,$T1); # 4
1615 &vpunpcklqdq ($T0,$T0,$T1); # 0:1
1620 &vpsrlq ($T1,$T0,26);
1624 &vpand ($T1,$T1,$MASK); # 1
1638 &vpaddq ($T1,$T1,&QWP(32*1,"esp"));
1650 &vmovdqa (&QWP(32*1,"esp"),$T1);
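The AVX2 path widens the same idea to four blocks: vinserti128 places blocks 2 and 3 into the high 128-bit lanes of the ymm registers holding blocks 0 and 1, after which the vpunpck/vpand/vpsrlq lines slice limbs across all four blocks at once. A sketch of that load pattern, pointer and names hypothetical:

    #include <immintrin.h>
    #include <stdint.h>

    /* even = (block0, block2), odd = (block1, block3). Requires AVX2. */
    static void load4_blocks(const uint8_t *inp, __m256i *even, __m256i *odd)
    {
        __m128i b0 = _mm_loadu_si128((const __m128i *)(inp +  0));
        __m128i b1 = _mm_loadu_si128((const __m128i *)(inp + 16));
        *even = _mm256_inserti128_si256(_mm256_castsi128_si256(b0),
                    _mm_loadu_si128((const __m128i *)(inp + 32)), 1);
        *odd  = _mm256_inserti128_si256(_mm256_castsi128_si256(b1),
                    _mm_loadu_si128((const __m128i *)(inp + 48)), 1);
    }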
1660 &vpmuludq ($T1,$T0,&$addr(4)); # h0*r4
1661 &vpaddq ($D4,$D4,$T1); # d4 + h0*r4
1665 &vpmuludq ($T1,$T0,&$addr(1)); # h0*r1
1666 &vpaddq ($D1,$D1,$T1); # d1 += h0*r1
1670 &vpmuludq ($T1,$T2,&$addr(2)); # h1*r2
1671 &vpaddq ($D3,$D3,$T1); # d3 += h1*r2
1674 &vpmuludq ($T1,$T2,&$addr(8)); # h1*s4
1675 &vpaddq ($D0,$D0,$T1); # d0 += h1*s4
1676 &vmovdqa ($T1,&QWP(32*3,"esp")); # h3
1682 &vpmuludq ($T0,$T1,&$addr(0)); # h3*r0
1684 &vpmuludq ($T2,$T1,&$addr(1)); # h3*r1
1686 &vpmuludq ($T0,$T1,&$addr(6)); # h3*s2
1689 &vpmuludq ($T2,$T1,&$addr(7)); # h3*s3
1691 &vpmuludq ($T1,$T1,&$addr(8)); # h3*s4
1692 &vpaddq ($D2,$D2,$T1); # d2 += h3*s4
1696 &vpmuludq ($T1,$T0,&$addr(5)); # h4*s1
1697 &vpaddq ($D0,$D0,$T1); # d0 += h4*s1
1701 &vpmuludq ($T1,$T0,&$addr(6)); # h4*s2
1702 &vpaddq ($D1,$D1,$T1); # d1 += h4*s2
1714 &vpsrlq ($T1,$D0,26);
1717 &vpaddq ($D1,$D1,$T1); # h0 -> h1
1720 &vpsrlq ($T1,$D1,26);
1722 &vpaddq ($D2,$D2,$T1); # h1 -> h2
1725 &vpsrlq ($T1,$D2,26);
1728 &vpaddq ($D3,$D3,$T1); # h2 -> h3
1729 &vpsrlq ($T1,$D3,26);
1734 &vpaddq ($D4,$D4,$T1); # h3 -> h4
1739 &vmovdqu (&X($T1),&QWP(16*1,"esi"));
1741 &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1);
1756 &vpsrldq ($T1,$D3,8);
1759 &vpaddq ($D3,$D3,$T1);
1760 &vpsrldq ($T1,$D1,8);
1763 &vpaddq ($D1,$D1,$T1);
1764 &vpermq ($T1,$D4,2); # keep folding
1767 &vpaddq ($D4,$D4,$T1);
1768 &vpermq ($T1,$D0,2);
1771 &vpaddq ($D0,$D0,$T1);
1772 &vpermq ($T1,$D2,2);
1774 &vpaddq ($D2,$D2,$T1);
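The closing vpsrldq/vpermq/vpaddq run ("keep folding") is a horizontal sum of the four block lanes: vpsrldq by 8 adds the in-lane neighbour qword, then vpermq with immediate 2 pulls the high lane's partial sum down to qword 0. An AVX2-intrinsics sketch; the real code keeps the result in a register for the final reduction, the store here exists only to read lane 0:

    #include <immintrin.h>
    #include <stdint.h>

    static uint64_t fold4_lanes(__m256i v)
    {
        uint64_t out[4];
        v = _mm256_add_epi64(v, _mm256_srli_si256(v, 8));          /* q0+q1, q2+q3 */
        v = _mm256_add_epi64(v, _mm256_permute4x64_epi64(v, 2));   /* + high lane  */
        _mm256_storeu_si256((__m256i *)out, v);
        return out[0];                       /* q0+q1+q2+q3 */
    }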