Lines Matching refs:dst
30 void SharedTurboAssembler::Move(Register dst, uint32_t src) {
33 mov(dst, Immediate(src));
35 movl(dst, Immediate(src));
41 void SharedTurboAssembler::Move(Register dst, Register src) {
43 if (dst != src) {
45 mov(dst, src);
47 movq(dst, src);
54 void SharedTurboAssembler::Add(Register dst, Immediate src) {
57 add(dst, src);
59 addq(dst, src);
65 void SharedTurboAssembler::And(Register dst, Immediate src) {
68 and_(dst, src);
71 andl(dst, src);
73 andq(dst, src);
80 void SharedTurboAssembler::Movhps(XMMRegister dst, XMMRegister src1,
84 vmovhps(dst, src1, src2);
86 if (dst != src1) {
87 movaps(dst, src1);
89 movhps(dst, src2);
93 void SharedTurboAssembler::Movlps(XMMRegister dst, XMMRegister src1,
97 vmovlps(dst, src1, src2);
99 if (dst != src1) {
100 movaps(dst, src1);
102 movlps(dst, src2);
106 void SharedTurboAssembler::Pblendvb(XMMRegister dst, XMMRegister src1,
110 vpblendvb(dst, src1, src2, mask);
114 DCHECK_EQ(dst, src1);
115 pblendvb(dst, src2);
119 void SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1,
123 vshufps(dst, src1, src2, imm8);
125 if (dst != src1) {
126 movaps(dst, src1);
128 shufps(dst, src2, imm8);
132 void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
136 if (dst != src) {
137 Movaps(dst, src);
143 // Pass src as an operand to avoid a false dependency on dst.
144 vmovhlps(dst, src, src);
146 movhlps(dst, src);
151 void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src,
157 vmovsd(dst, src, rep);
159 vmovlhps(dst, src, rep);
163 if (dst != src) {
164 DCHECK_NE(dst, rep); // Ensure rep is not overwritten.
165 movaps(dst, src);
168 movsd(dst, rep);
170 movlhps(dst, rep);
175 void SharedTurboAssembler::F32x4Min(XMMRegister dst, XMMRegister lhs,
183 vminps(dst, rhs, lhs);
184 } else if (dst == lhs || dst == rhs) {
185 XMMRegister src = dst == lhs ? rhs : lhs;
187 minps(scratch, dst);
188 minps(dst, src);
192 movaps(dst, rhs);
193 minps(dst, lhs);
196 Orps(scratch, dst);
198 Cmpunordps(dst, dst, scratch);
199 Orps(scratch, dst);
200 Psrld(dst, dst, byte{10});
201 Andnps(dst, dst, scratch);
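// A minimal scalar sketch (editorial illustration, assuming IEEE-754 binary32;
// F32MinSketch is a hypothetical helper, not V8 code) of the per-lane semantics
// the minps/orps/cmpunordps/psrld/andnps sequence above implements: NaN inputs
// yield a quiet NaN, and min(-0, +0) is -0.
#include <cmath>
#include <limits>

float F32MinSketch(float a, float b) {
  if (std::isnan(a) || std::isnan(b)) {
    // cmpunordps flags the lane; psrld + andnps then canonicalize the NaN.
    return std::numeric_limits<float>::quiet_NaN();
  }
  if (a == 0.0f && b == 0.0f) {
    // Or-ing the two zeros keeps a sign bit if either operand is -0.
    return std::signbit(a) ? a : b;
  }
  return a < b ? a : b;
}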
204 void SharedTurboAssembler::F32x4Max(XMMRegister dst, XMMRegister lhs,
212 vmaxps(dst, rhs, lhs);
213 } else if (dst == lhs || dst == rhs) {
214 XMMRegister src = dst == lhs ? rhs : lhs;
216 maxps(scratch, dst);
217 maxps(dst, src);
221 movaps(dst, rhs);
222 maxps(dst, lhs);
225 Xorps(dst, scratch);
227 Orps(scratch, dst);
229 Subps(scratch, scratch, dst);
231 Cmpunordps(dst, dst, scratch);
232 Psrld(dst, dst, byte{10});
233 Andnps(dst, dst, scratch);
236 void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs,
244 vminpd(dst, rhs, lhs);
246 vorpd(scratch, scratch, dst);
248 vcmpunordpd(dst, dst, scratch);
249 vorpd(scratch, scratch, dst);
250 vpsrlq(dst, dst, byte{13});
251 vandnpd(dst, dst, scratch);
254 // and dst. If dst overlaps with lhs or rhs, we can save a move.
255 if (dst == lhs || dst == rhs) {
256 XMMRegister src = dst == lhs ? rhs : lhs;
258 minpd(scratch, dst);
259 minpd(dst, src);
262 movaps(dst, rhs);
264 minpd(dst, lhs);
266 orpd(scratch, dst);
267 cmpunordpd(dst, scratch);
268 orpd(scratch, dst);
269 psrlq(dst, byte{13});
270 andnpd(dst, scratch);
274 void SharedTurboAssembler::F64x2Max(XMMRegister dst, XMMRegister lhs,
282 vmaxpd(dst, rhs, lhs);
284 vxorpd(dst, dst, scratch);
286 vorpd(scratch, scratch, dst);
288 vsubpd(scratch, scratch, dst);
290 vcmpunordpd(dst, dst, scratch);
291 vpsrlq(dst, dst, byte{13});
292 vandnpd(dst, dst, scratch);
294 if (dst == lhs || dst == rhs) {
295 XMMRegister src = dst == lhs ? rhs : lhs;
297 maxpd(scratch, dst);
298 maxpd(dst, src);
301 movaps(dst, rhs);
303 maxpd(dst, lhs);
305 xorpd(dst, scratch);
306 orpd(scratch, dst);
307 subpd(scratch, dst);
308 cmpunordpd(dst, scratch);
309 psrlq(dst, byte{13});
310 andnpd(dst, scratch);
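// The same idea for max, as a scalar sketch (illustration only; F64MaxSketch is
// a hypothetical helper): NaNs become a quiet NaN, and max(-0, +0) must be +0,
// which is why the sequences above cannot use a bare maxpd and add the
// xor/sub-style sign fixup.
#include <cmath>
#include <limits>

double F64MaxSketch(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) {
    return std::numeric_limits<double>::quiet_NaN();  // cmpunordpd path
  }
  if (a == 0.0 && b == 0.0) {
    return std::signbit(a) ? b : a;  // prefer the +0 operand
  }
  return a > b ? a : b;
}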
314 void SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) {
318 vbroadcastss(dst, src);
321 vshufps(dst, src, src, 0);
323 if (dst == src) {
325 shufps(dst, src, 0);
327 pshufd(dst, src, 0);
332 void SharedTurboAssembler::F32x4ExtractLane(FloatRegister dst, XMMRegister src,
337 // the top lanes of dst.
339 if (dst != src) {
340 Movaps(dst, src);
343 Movshdup(dst, src);
344 } else if (lane == 2 && dst == src) {
345 // Check dst == src to avoid a false dependency on dst.
346 Movhlps(dst, src);
347 } else if (dst == src) {
348 Shufps(dst, src, src, lane);
350 Pshufd(dst, src, lane);
354 void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
358 Movss(dst, src);
361 Extractps(dst, src, laneidx);
366 void SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src,
371 Movd(dst, src);
373 Pshufb(dst, scratch);
376 void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
382 vpbroadcastb(dst, scratch);
384 I8x16SplatPreAvx2(dst, src, scratch);
388 void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
394 vpbroadcastb(dst, src);
396 I8x16SplatPreAvx2(dst, src, scratch);
400 void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
404 DCHECK_NE(dst, tmp2);
406 if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
407 movaps(dst, src1);
408 src1 = dst;
412 Psllw(dst, src1, byte{shift});
419 Pand(dst, tmp2);
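// Scalar sketch of the trick above (illustration; I8PairShlSketch is a
// hypothetical helper): x86 has no packed 8-bit shift, so the code shifts
// 16-bit lanes (Psllw) and masks off the bits that leaked across the byte
// boundary (Pand with 0xFF << shift replicated into every byte).
#include <cstdint>

uint16_t I8PairShlSketch(uint16_t two_bytes, unsigned shift) {
  uint16_t shifted = static_cast<uint16_t>(two_bytes << shift);  // Psllw
  uint8_t byte_mask = static_cast<uint8_t>(0xFFu << shift);
  uint16_t mask = static_cast<uint16_t>(byte_mask * 0x0101u);    // splat mask
  return shifted & mask;                                         // Pand
}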
422 void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
426 DCHECK(!AreAliased(dst, tmp2, tmp3));
438 if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
439 movaps(dst, src1);
440 src1 = dst;
443 Pand(dst, src1, tmp2);
446 Psllw(dst, dst, tmp3);
449 void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
453 DCHECK_NE(dst, tmp);
457 Punpcklbw(dst, src1);
459 Psraw(dst, shift);
460 Packsswb(dst, tmp);
463 void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
467 DCHECK(!AreAliased(dst, tmp2, tmp3));
472 Punpcklbw(dst, src1);
480 Psraw(dst, tmp3);
481 Packsswb(dst, tmp2);
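// Scalar sketch covering both ShrS variants above (illustration; I8ShrSSketch
// is a hypothetical helper): with no packed 8-bit arithmetic shift, bytes are
// widened to 16-bit lanes (Punpcklbw/Punpckhbw), shifted with sign (Psraw),
// and narrowed back with signed saturation (Packsswb), which never actually
// saturates here because a right-shifted int8 always fits in int8.
#include <cstdint>

int8_t I8ShrSSketch(int8_t x, unsigned shift) {
  int16_t wide = x;                           // sign-extending widen
  return static_cast<int8_t>(wide >> shift);  // Psraw, then lossless Packsswb
}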
484 void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
488 DCHECK_NE(dst, tmp2);
489 if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
490 movaps(dst, src1);
491 src1 = dst;
496 Psrlw(dst, src1, shift);
503 Pand(dst, tmp2);
506 void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
510 DCHECK(!AreAliased(dst, tmp2, tmp3));
515 Punpcklbw(dst, src1);
523 Psrlw(dst, tmp3);
524 Packuswb(dst, tmp2);
528 void SharedTurboAssembler::I16x8SplatPreAvx2(XMMRegister dst, Op src) {
530 Movd(dst, src);
531 Pshuflw(dst, dst, uint8_t{0x0});
532 Punpcklqdq(dst, dst);
535 void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Register src) {
539 Movd(dst, src);
540 vpbroadcastw(dst, dst);
542 I16x8SplatPreAvx2(dst, src);
546 void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Operand src) {
551 vpbroadcastw(dst, src);
553 I16x8SplatPreAvx2(dst, src);
557 void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
562 is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2);
563 Pmullw(dst, scratch);
566 void SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
574 vpunpckhbw(dst, src2, src2);
575 vpsraw(dst, dst, 8);
576 vpmullw(dst, dst, scratch);
578 if (dst != src1) {
579 movaps(dst, src1);
582 punpckhbw(dst, dst);
583 psraw(dst, 8);
586 pmullw(dst, scratch);
590 void SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
600 vpunpckhbw(dst, src1, scratch);
601 vpmullw(dst, dst, dst);
603 if (dst == src2) {
604 // We overwrite dst, then use src2, so swap src1 and src2.
608 vpunpckhbw(dst, src1, scratch);
610 vpmullw(dst, dst, scratch);
615 if (dst != src1) {
616 movaps(dst, src1);
618 punpckhbw(dst, scratch);
619 pmullw(dst, scratch);
621 // When dst == src1, nothing special needs to be done.
622 // When dst == src2, swap src1 and src2, since we overwrite dst.
623 // When dst is unique, copy src1 to dst first.
624 if (dst == src2) {
626 // Now, dst == src1.
627 } else if (dst != src1) {
628 // dst != src1 && dst != src2.
629 movaps(dst, src1);
632 punpckhbw(dst, scratch);
635 pmullw(dst, scratch);
640 void SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst,
646 // dst = |i|i|j|j|k|k|l|l|m|m|n|n|o|o|p|p|
647 vpunpckhbw(dst, src, src);
648 vpsraw(dst, dst, 8);
651 if (dst == src) {
652 // 2 bytes shorter than pshufd, but has a dependency on dst.
653 movhlps(dst, src);
654 pmovsxbw(dst, dst);
656 // No dependency on dst.
657 pshufd(dst, src, 0xEE);
658 pmovsxbw(dst, dst);
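// Scalar sketch of the AVX path above (illustration; SConvertHighSketch is a
// hypothetical helper): vpunpckhbw(dst, src, src) pairs each byte with itself,
// so every 16-bit lane holds (b << 8) | b; vpsraw by 8 then leaves the
// sign-extended byte.
#include <cstdint>

int16_t SConvertHighSketch(uint8_t b) {
  int16_t lane = static_cast<int16_t>((uint16_t{b} << 8) | b);  // vpunpckhbw
  return static_cast<int16_t>(lane >> 8);                       // vpsraw
}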
663 void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
671 // dst = |0|a|0|b|0|c|0|d | 0|e|0|f|0|g|0|h|
672 XMMRegister tmp = dst == src ? scratch : dst;
674 vpunpckhbw(dst, src, tmp);
677 if (dst == src) {
680 punpckhbw(dst, scratch);
682 // No dependency on dst.
683 pshufd(dst, src, 0xEE);
684 pmovzxbw(dst, dst);
689 void SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
697 if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
698 movaps(dst, src1);
699 src1 = dst;
702 Pmulhrsw(dst, src1, src2);
703 Pcmpeqw(scratch, dst);
704 Pxor(dst, scratch);
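// Scalar sketch (illustration; Q15MulRSatSSketch is a hypothetical helper) of
// what Pmulhrsw computes and why the Pcmpeqw/Pxor fixup exists: the only
// unrepresentable result is -32768 * -32768, which pmulhrsw wraps to 0x8000;
// comparing against 0x8000 and flipping those lanes yields 0x7FFF (saturation).
#include <cstdint>

int16_t Q15MulRSatSSketch(int16_t a, int16_t b) {
  int32_t p = (int32_t{a} * b + (1 << 14)) >> 15;  // pmulhrsw: round(a*b / 2^15)
  if (p > INT16_MAX) p = INT16_MAX;                // the Pcmpeqw/Pxor lane flip
  return static_cast<int16_t>(p);
}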
707 void SharedTurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
716 // dst = |0|b|0|d|0|f|0|h|
717 vpblendw(dst, src, tmp, 0xAA);
718 // dst = |a+b|c+d|e+f|g+h|
719 vpaddd(dst, tmp, dst);
726 if (dst != src) {
727 movaps(dst, src);
729 pblendw(dst, tmp, 0xAA);
730 paddd(dst, tmp);
738 // dst = |0|a|0|c|0|e|0|g|
739 if (dst != src) {
740 movaps(dst, src);
742 psrld(dst, byte{16});
743 // dst = |a+b|c+d|e+f|g+h|
744 paddd(dst, tmp);
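// Scalar sketch of one 32-bit output lane (illustration; ExtAddPairwiseSketch
// is a hypothetical helper), mirroring the shift/blend/add sequences above:
#include <cstdint>

uint32_t ExtAddPairwiseSketch(uint32_t packed_pair) {
  uint32_t odd = packed_pair >> 16;      // psrld 16: high word, zero-extended
  uint32_t even = packed_pair & 0xFFFF;  // pblendw against zeros: low word
  return odd + even;                     // paddd
}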
749 // 2. Multiply the high word (signed or unsigned) into dst.
750 // 3. Unpack and interleave scratch and dst into dst.
751 void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
758 is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
759 low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
761 DCHECK_EQ(dst, src1);
763 pmullw(dst, src2);
765 low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
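// Scalar sketch of the widening multiply (illustration; ExtMulS16Sketch is a
// hypothetical helper): pmullw gives the low 16 bits of each product,
// pmulhw/pmulhuw the high 16 bits, and punpcklwd/punpckhwd interleave the two
// halves into full 32-bit products.
#include <cstdint>

int32_t ExtMulS16Sketch(int16_t a, int16_t b) {
  uint16_t lo = static_cast<uint16_t>(int32_t{a} * b);          // pmullw
  uint16_t hi = static_cast<uint16_t>((int32_t{a} * b) >> 16);  // pmulhw
  return static_cast<int32_t>((uint32_t{hi} << 16) | lo);       // punpck*wd
}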
769 void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst,
775 // dst = |e|e|f|f|g|g|h|h|
776 vpunpckhwd(dst, src, src);
777 vpsrad(dst, dst, 16);
780 if (dst == src) {
781 // 2 bytes shorter than pshufd, but has a dependency on dst.
782 movhlps(dst, src);
783 pmovsxwd(dst, dst);
785 // No dependency on dst.
786 pshufd(dst, src, 0xEE);
787 pmovsxwd(dst, dst);
792 void SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst,
800 // dst = |0|a|0|b|0|c|0|d|
801 XMMRegister tmp = dst == src ? scratch : dst;
803 vpunpckhwd(dst, src, tmp);
805 if (dst == src) {
808 punpckhwd(dst, scratch);
811 // No dependency on dst.
812 pshufd(dst, src, 0xEE);
813 pmovzxwd(dst, dst);
818 void SharedTurboAssembler::I64x2Neg(XMMRegister dst, XMMRegister src,
824 vpsubq(dst, scratch, src);
826 if (dst == src) {
830 pxor(dst, dst);
831 psubq(dst, src);
835 void SharedTurboAssembler::I64x2Abs(XMMRegister dst, XMMRegister src,
840 XMMRegister tmp = dst == src ? scratch : dst;
843 vblendvpd(dst, src, tmp, src);
847 if (dst != src) {
848 movaps(dst, src);
851 xorps(dst, scratch);
852 psubq(dst, scratch);
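// Scalar sketch of the SSE fallback above (illustration; I64AbsSketch is a
// hypothetical helper): with the lane's sign replicated into a mask s,
// abs(x) == (x ^ s) - s, which is exactly the xorps/psubq pair.
#include <cstdint>

int64_t I64AbsSketch(int64_t x) {
  int64_t s = x >> 63;  // all-ones if negative, else zero (the scratch mask)
  return (x ^ s) - s;   // xorps + psubq
}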
856 void SharedTurboAssembler::I64x2GtS(XMMRegister dst, XMMRegister src0,
861 vpcmpgtq(dst, src0, src1);
864 if (dst == src0) {
865 pcmpgtq(dst, src1);
866 } else if (dst == src1) {
869 movaps(dst, scratch);
871 movaps(dst, src0);
872 pcmpgtq(dst, src1);
876 DCHECK_NE(dst, src0);
877 DCHECK_NE(dst, src1);
878 movaps(dst, src1);
880 psubq(dst, src0);
882 andps(dst, scratch);
885 orps(dst, scratch);
886 movshdup(dst, dst);
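// Scalar sketch of the lane semantics (illustration; I64GtSSketch is a
// hypothetical helper): pcmpgtq needs SSE4.2, so the fallback above has to
// synthesize a signed 64-bit compare from narrower compares plus a subtract,
// producing an all-ones or all-zeros lane mask.
#include <cstdint>

uint64_t I64GtSSketch(int64_t a, int64_t b) {
  int32_t a_hi = static_cast<int32_t>(a >> 32);
  int32_t b_hi = static_cast<int32_t>(b >> 32);
  uint32_t a_lo = static_cast<uint32_t>(a);
  uint32_t b_lo = static_cast<uint32_t>(b);
  // Signed compare on the high halves, unsigned on the low halves.
  bool gt = a_hi > b_hi || (a_hi == b_hi && a_lo > b_lo);
  return gt ? ~uint64_t{0} : uint64_t{0};  // what pcmpgtq would write
}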
890 void SharedTurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0,
895 vpcmpgtq(dst, src1, src0);
897 vpxor(dst, dst, scratch);
900 DCHECK_NE(dst, src0);
901 if (dst != src1) {
902 movaps(dst, src1);
904 pcmpgtq(dst, src0);
906 xorps(dst, scratch);
909 DCHECK_NE(dst, src0);
910 DCHECK_NE(dst, src1);
911 movaps(dst, src0);
913 psubq(dst, src1);
915 andps(dst, scratch);
918 orps(dst, scratch);
919 movshdup(dst, dst);
921 xorps(dst, scratch);
925 void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
929 DCHECK_NE(xmm_tmp, dst);
944 if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
945 movaps(dst, src);
946 src = dst;
950 Pxor(dst, src, xmm_tmp);
952 Psrlq(dst, shift);
955 Psubq(dst, xmm_tmp);
958 void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
963 DCHECK_NE(xmm_tmp, dst);
965 DCHECK_NE(xmm_shift, dst);
978 if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
979 movaps(dst, src);
980 src = dst;
982 Pxor(dst, src, xmm_tmp);
983 Psrlq(dst, xmm_shift);
985 Psubq(dst, xmm_tmp);
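// Scalar sketch of the bias trick used by both ShrS variants above
// (illustration; I64ShrSSketch is a hypothetical helper): there is no packed
// 64-bit arithmetic right shift before AVX-512, but
// sar(x, s) == ((x ^ m) >> s) - (m >> s) with m = 1 << 63, using only
// logical shifts.
#include <cstdint>

int64_t I64ShrSSketch(int64_t x, unsigned shift) {
  uint64_t m = uint64_t{1} << 63;                       // sign-bit mask (xmm_tmp)
  uint64_t biased = static_cast<uint64_t>(x) ^ m;       // Pxor: flip the sign bit
  uint64_t shifted = biased >> shift;                   // Psrlq (logical)
  return static_cast<int64_t>(shifted - (m >> shift));  // Psubq: remove the bias
}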
988 void SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs,
992 DCHECK(!AreAliased(dst, tmp1, tmp2));
1008 vpmuludq(dst, lhs, rhs);
1010 vpaddq(dst, dst, tmp2);
1021 if (dst == rhs) {
1023 pmuludq(dst, lhs);
1025 if (dst != lhs) {
1026 movaps(dst, lhs);
1028 pmuludq(dst, rhs);
1030 paddq(dst, tmp2);
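// Scalar sketch of the schoolbook decomposition above (illustration;
// I64MulLowSketch is a hypothetical helper): SSE2 only has a 32x32->64
// multiply (pmuludq), so the low 64 bits of a 64x64 product are assembled
// from partial products; the a_hi * b_hi term shifts entirely out of the
// low half and is dropped.
#include <cstdint>

uint64_t I64MulLowSketch(uint64_t a, uint64_t b) {
  uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;     // psrlq 32
  uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
  uint64_t cross = (a_hi * b_lo + a_lo * b_hi) << 32;  // pmuludq x2, paddq, psllq
  return a_lo * b_lo + cross;                          // pmuludq, paddq
}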
1035 // 2. Unpack src1, src0 into even-numbered elements of dst.
1038 void SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
1046 vpunpckldq(dst, src2, src2);
1049 vpunpckhdq(dst, src2, src2);
1052 vpmuldq(dst, scratch, dst);
1054 vpmuludq(dst, scratch, dst);
1059 pshufd(dst, src2, mask);
1062 pmuldq(dst, scratch);
1064 pmuludq(dst, scratch);
1069 void SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst,
1074 vpunpckhqdq(dst, src, src);
1075 vpmovsxdq(dst, dst);
1078 if (dst == src) {
1079 movhlps(dst, src);
1081 pshufd(dst, src, 0xEE);
1083 pmovsxdq(dst, dst);
1087 void SharedTurboAssembler::I64x2UConvertI32x4High(XMMRegister dst,
1094 vpunpckhdq(dst, src, scratch);
1096 if (dst == src) {
1099 punpckhdq(dst, scratch);
1102 // No dependency on dst.
1103 pshufd(dst, src, 0xEE);
1104 pmovzxdq(dst, dst);
1109 void SharedTurboAssembler::S128Not(XMMRegister dst, XMMRegister src,
1112 if (dst == src) {
1114 Pxor(dst, scratch);
1116 Pcmpeqd(dst, dst);
1117 Pxor(dst, src);
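// Scalar sketch per 32-bit lane (illustration; S128NotSketch is a hypothetical
// helper): SSE has no vector NOT, so the code xors with an all-ones value
// produced by comparing a register against itself.
#include <cstdint>

uint32_t S128NotSketch(uint32_t x) {
  uint32_t all_ones = ~uint32_t{0};  // Pcmpeqd(reg, reg)
  return x ^ all_ones;               // Pxor
}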
1121 void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
1130 vpand(dst, src1, mask);
1131 vpor(dst, dst, scratch);
1133 DCHECK_EQ(dst, mask);
1137 andps(dst, src1);
1138 orps(dst, scratch);
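// Scalar sketch per 32-bit lane (illustration; S128SelectSketch is a
// hypothetical helper): a fully bitwise select, each mask bit choosing
// between src1 and src2.
#include <cstdint>

uint32_t S128SelectSketch(uint32_t mask, uint32_t src1, uint32_t src2) {
  return (src1 & mask) | (src2 & ~mask);  // pand + pandn + por
}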
1142 void SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src,
1150 vpbroadcastb(dst, src);
1153 // Avoid dependency on previous value of dst.
1154 vpinsrb(dst, scratch, src, uint8_t{0});
1156 vpshufb(dst, dst, scratch);
1159 pinsrb(dst, src, uint8_t{0});
1161 pshufb(dst, scratch);
1165 void SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src,
1173 vpbroadcastw(dst, src);
1176 // Avoid dependency on previous value of dst.
1177 vpinsrw(dst, scratch, src, uint8_t{0});
1178 vpshuflw(dst, dst, uint8_t{0});
1179 vpunpcklqdq(dst, dst, dst);
1181 pinsrw(dst, src, uint8_t{0});
1182 pshuflw(dst, dst, uint8_t{0});
1183 movlhps(dst, dst);
1187 void SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) {
1194 vbroadcastss(dst, src);
1196 movss(dst, src);
1197 shufps(dst, dst, byte{0});
1201 void SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
1205 Movlps(dst, src);
1208 Movhps(dst, src);
1217 if (dst == src1) { \
1218 vfmadd231##ps_or_pd(dst, src2, src3); \
1219 } else if (dst == src2) { \
1220 vfmadd132##ps_or_pd(dst, src1, src3); \
1221 } else if (dst == src3) { \
1222 vfmadd213##ps_or_pd(dst, src2, src1); \
1225 vmovups(dst, src1); \
1226 vfmadd231##ps_or_pd(dst, src2, src3); \
1231 vadd##ps_or_pd(dst, src1, tmp); \
1233 if (dst == src1) { \
1236 add##ps_or_pd(dst, tmp); \
1237 } else if (dst == src2) { \
1241 } else if (dst == src3) { \
1246 movaps(dst, src2); \
1247 mul##ps_or_pd(dst, src3); \
1248 add##ps_or_pd(dst, src1); \
1257 if (dst == src1) { \
1258 vfnmadd231##ps_or_pd(dst, src2, src3); \
1259 } else if (dst == src2) { \
1260 vfnmadd132##ps_or_pd(dst, src1, src3); \
1261 } else if (dst == src3) { \
1262 vfnmadd213##ps_or_pd(dst, src2, src1); \
1265 vmovups(dst, src1); \
1266 vfnmadd231##ps_or_pd(dst, src2, src3); \
1271 vsub##ps_or_pd(dst, src1, tmp); \
1275 if (dst != src1) { \
1276 movaps(dst, src1); \
1278 sub##ps_or_pd(dst, tmp); \
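// Scalar sketch of what every branch above computes (illustration;
// QfmaSketch/QfmsSketch are hypothetical helpers): dst = src1 +/- src2 * src3.
// The fmadd231/132/213 suffixes all encode the same fused operation and differ
// only in which source operand doubles as the destination, which is why the
// macros dispatch on aliasing; the non-FMA fallback computes the same value
// with two roundings (separate multiply and add).
#include <cmath>

float QfmaSketch(float src1, float src2, float src3) {
  return std::fma(src2, src3, src1);   // e.g. vfmadd231ps when dst == src1
}

float QfmsSketch(float src1, float src2, float src3) {
  return std::fma(-src2, src3, src1);  // vfnmadd231ps: src1 - src2 * src3
}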
1281 void SharedTurboAssembler::F32x4Qfma(XMMRegister dst, XMMRegister src1,
1287 void SharedTurboAssembler::F32x4Qfms(XMMRegister dst, XMMRegister src1,
1293 void SharedTurboAssembler::F64x2Qfma(XMMRegister dst, XMMRegister src1,
1299 void SharedTurboAssembler::F64x2Qfms(XMMRegister dst, XMMRegister src1,