1 // Copyright 2021 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_
6 #define V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_
7 
8 #include "src/base/macros.h"
9 #include "src/codegen/cpu-features.h"
10 #include "src/codegen/external-reference.h"
11 #include "src/codegen/turbo-assembler.h"
12 
13 #if V8_TARGET_ARCH_IA32
14 #include "src/codegen/ia32/register-ia32.h"
15 #elif V8_TARGET_ARCH_X64
16 #include "src/codegen/x64/register-x64.h"
17 #else
18 #error Unsupported target architecture.
19 #endif
20 
21 namespace v8 {
22 namespace internal {
23 class Assembler;
24 
25 // For WebAssembly we care about the full floating point register. If we are not
26 // running Wasm, we can get away with saving half of those registers.
27 #if V8_ENABLE_WEBASSEMBLY
28 constexpr int kStackSavedSavedFPSize = 2 * kDoubleSize;
29 #else
30 constexpr int kStackSavedSavedFPSize = kDoubleSize;
31 #endif  // V8_ENABLE_WEBASSEMBLY
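// (2 * kDoubleSize, i.e. 16 bytes, covers a full 128-bit XMM register;
// kDoubleSize saves only the low 64 bits of each register.)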
32 
33 // Base class for SharedTurboAssemblerBase. This class contains macro-assembler
34 // functions that can be shared across ia32 and x64 without any template
35 // machinery, i.e. does not require the CRTP pattern that
36 // SharedTurboAssemblerBase exposes. This allows us to keep the bulk of
37 // the definitions in a separate source file, rather than putting everything
38 // inside this header.
39 class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
40  public:
41   using TurboAssemblerBase::TurboAssemblerBase;
42 
43   void Move(Register dst, uint32_t src);
44   // Move if registers are not identical.
45   void Move(Register dst, Register src);
46   void Add(Register dst, Immediate src);
47   void And(Register dst, Immediate src);
48 
49   // Will move src1 to dst if AVX is not supported.
50   void Movhps(XMMRegister dst, XMMRegister src1, Operand src2);
51   void Movlps(XMMRegister dst, XMMRegister src1, Operand src2);
52 
53   void Pblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2,
54                 XMMRegister mask);
55 
56   template <typename Op>
57   void Pinsrb(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
58               uint32_t* load_pc_offset = nullptr) {
59     PinsrHelper(this, &Assembler::vpinsrb, &Assembler::pinsrb, dst, src1, src2,
60                 imm8, load_pc_offset, {SSE4_1});
61   }
62 
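  // Note: pinsrw on XMM registers is baseline SSE2, so unlike Pinsrb above no
  // extra CpuFeature needs to be passed to PinsrHelper here.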
63   template <typename Op>
64   void Pinsrw(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
65               uint32_t* load_pc_offset = nullptr) {
66     PinsrHelper(this, &Assembler::vpinsrw, &Assembler::pinsrw, dst, src1, src2,
67                 imm8, load_pc_offset);
68   }
69 
70   // Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
71   template <typename Op>
72   void Pshufb(XMMRegister dst, XMMRegister src, Op mask) {
73     if (CpuFeatures::IsSupported(AVX)) {
74       CpuFeatureScope avx_scope(this, AVX);
75       vpshufb(dst, src, mask);
76     } else {
77       // Make sure these are different so that we won't overwrite mask.
78       DCHECK_NE(mask, dst);
79       if (dst != src) {
80         movaps(dst, src);
81       }
82       CpuFeatureScope sse_scope(this, SSSE3);
83       pshufb(dst, mask);
84     }
85   }
86 
87   template <typename Op>
88   void Pshufb(XMMRegister dst, Op mask) {
89     Pshufb(dst, dst, mask);
90   }
91 
92   // Shufps that will move src1 into dst if AVX is not supported.
93   void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
94               uint8_t imm8);
95 
96   // Helper struct to implement functions that check for AVX support and
97   // dispatch to the appropriate AVX/SSE instruction.
98   template <typename Dst, typename Arg, typename... Args>
99   struct AvxHelper {
100     Assembler* assm;
101     base::Optional<CpuFeature> feature = base::nullopt;
102     // Call a method where the AVX version expects the dst argument to be
103     // duplicated.
104     // E.g. Andps(x, y) -> vandps(x, x, y)
105     //                  -> andps(x, y)
106     template <void (Assembler::*avx)(Dst, Dst, Arg, Args...),
107               void (Assembler::*no_avx)(Dst, Arg, Args...)>
108     void emit(Dst dst, Arg arg, Args... args) {
109       if (CpuFeatures::IsSupported(AVX)) {
110         CpuFeatureScope scope(assm, AVX);
111         (assm->*avx)(dst, dst, arg, args...);
112       } else if (feature.has_value()) {
113         DCHECK(CpuFeatures::IsSupported(*feature));
114         CpuFeatureScope scope(assm, *feature);
115         (assm->*no_avx)(dst, arg, args...);
116       } else {
117         (assm->*no_avx)(dst, arg, args...);
118       }
119     }
120 
121     // Call a method in the AVX form (one more operand); if AVX is unsupported,
122     // check that dst == the first src.
123     // E.g. Andps(x, y, z) -> vandps(x, y, z)
124     //                     -> andps(x, z) and check that x == y
125     template <void (Assembler::*avx)(Dst, Arg, Args...),
126               void (Assembler::*no_avx)(Dst, Args...)>
127     void emit(Dst dst, Arg arg, Args... args) {
128       if (CpuFeatures::IsSupported(AVX)) {
129         CpuFeatureScope scope(assm, AVX);
130         (assm->*avx)(dst, arg, args...);
131       } else if (feature.has_value()) {
132         DCHECK_EQ(dst, arg);
133         DCHECK(CpuFeatures::IsSupported(*feature));
134         CpuFeatureScope scope(assm, *feature);
135         (assm->*no_avx)(dst, args...);
136       } else {
137         DCHECK_EQ(dst, arg);
138         (assm->*no_avx)(dst, args...);
139       }
140     }
141 
142     // Call a method where the AVX version expects no duplicated dst argument.
143     // E.g. Movddup(x, y) -> vmovddup(x, y)
144     //                    -> movddup(x, y)
145     template <void (Assembler::*avx)(Dst, Arg, Args...),
146               void (Assembler::*no_avx)(Dst, Arg, Args...)>
147     void emit(Dst dst, Arg arg, Args... args) {
148       if (CpuFeatures::IsSupported(AVX)) {
149         CpuFeatureScope scope(assm, AVX);
150         (assm->*avx)(dst, arg, args...);
151       } else if (feature.has_value()) {
152         DCHECK(CpuFeatures::IsSupported(*feature));
153         CpuFeatureScope scope(assm, *feature);
154         (assm->*no_avx)(dst, arg, args...);
155       } else {
156         (assm->*no_avx)(dst, arg, args...);
157       }
158     }
159   };
160 
161 #define AVX_OP(macro_name, name)                                        \
162   template <typename Dst, typename Arg, typename... Args>               \
163   void macro_name(Dst dst, Arg arg, Args... args) {                     \
164     AvxHelper<Dst, Arg, Args...>{this}                                  \
165         .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \
166                                                               args...); \
167   }
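// For illustration: AVX_OP(Addps, addps) below defines a templated
// Addps(dst, arg, args...) that dispatches through AvxHelper, so Addps(x, y)
// assembles vaddps x, x, y when AVX is available and addps x, y otherwise.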
168 
169 // Define a macro which uses |avx_name| when AVX is supported, and |sse_name|
170 // when AVX is not supported. This is useful for bit-wise instructions like
171 // andpd/andps, where the behavior is exactly the same, but the *ps
172 // version is 1 byte shorter, and on SSE-only processors there is no
173 // performance difference since those processors don't differentiate integer
174 // and floating-point domains.
175 // Note: we require |avx_name| to be the AVX instruction without the "v"
176 // prefix. If we require the full AVX instruction name and the caller
177 // accidentally passes in an SSE instruction, we compile without any issues and
178 // generate the SSE instruction. By appending "v" here, we ensure that we will
179 // generate an AVX instruction.
180 #define AVX_OP_WITH_DIFF_SSE_INSTR(macro_name, avx_name, sse_name)     \
181   template <typename Dst, typename Arg, typename... Args>              \
182   void macro_name(Dst dst, Arg arg, Args... args) {                    \
183     AvxHelper<Dst, Arg, Args...>{this}                                 \
184         .template emit<&Assembler::v##avx_name, &Assembler::sse_name>( \
185             dst, arg, args...);                                        \
186   }
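// E.g. AVX_OP_WITH_DIFF_SSE_INSTR(Pand, pand, andps) below defines Pand, which
// assembles vpand under AVX but the shorter andps on SSE-only processors.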
187 
188 #define AVX_OP_SSE3(macro_name, name)                                    \
189   template <typename Dst, typename Arg, typename... Args>                \
190   void macro_name(Dst dst, Arg arg, Args... args) {                      \
191     AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE3)} \
192         .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,  \
193                                                               args...);  \
194   }
195 
196 #define AVX_OP_SSSE3(macro_name, name)                                    \
197   template <typename Dst, typename Arg, typename... Args>                 \
198   void macro_name(Dst dst, Arg arg, Args... args) {                       \
199     AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSSE3)} \
200         .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,   \
201                                                               args...);   \
202   }
203 
204 #define AVX_OP_SSE4_1(macro_name, name)                                    \
205   template <typename Dst, typename Arg, typename... Args>                  \
206   void macro_name(Dst dst, Arg arg, Args... args) {                        \
207     AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_1)} \
208         .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,    \
209                                                               args...);    \
210   }
211 
212 #define AVX_OP_SSE4_2(macro_name, name)                                    \
213   template <typename Dst, typename Arg, typename... Args>                  \
214   void macro_name(Dst dst, Arg arg, Args... args) {                        \
215     AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_2)} \
216         .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,    \
217                                                               args...);    \
218   }
219 
220   // Keep this list sorted by required extension, then instruction name.
221   AVX_OP(Addpd, addpd)
222   AVX_OP(Addps, addps)
223   AVX_OP(Addsd, addsd)
224   AVX_OP(Addss, addss)
225   AVX_OP(Andnpd, andnpd)
226   AVX_OP(Andnps, andnps)
227   AVX_OP(Andpd, andpd)
228   AVX_OP(Andps, andps)
229   AVX_OP(Cmpeqpd, cmpeqpd)
230   AVX_OP(Cmpeqps, cmpeqps)
231   AVX_OP(Cmplepd, cmplepd)
232   AVX_OP(Cmpleps, cmpleps)
233   AVX_OP(Cmpltpd, cmpltpd)
234   AVX_OP(Cmpltps, cmpltps)
235   AVX_OP(Cmpneqpd, cmpneqpd)
236   AVX_OP(Cmpneqps, cmpneqps)
237   AVX_OP(Cmpunordpd, cmpunordpd)
238   AVX_OP(Cmpunordps, cmpunordps)
239   AVX_OP(Cvtdq2pd, cvtdq2pd)
240   AVX_OP(Cvtdq2ps, cvtdq2ps)
241   AVX_OP(Cvtpd2ps, cvtpd2ps)
242   AVX_OP(Cvtps2pd, cvtps2pd)
243   AVX_OP(Cvtsd2ss, cvtsd2ss)
244   AVX_OP(Cvtss2sd, cvtss2sd)
245   AVX_OP(Cvttpd2dq, cvttpd2dq)
246   AVX_OP(Cvttps2dq, cvttps2dq)
247   AVX_OP(Cvttsd2si, cvttsd2si)
248   AVX_OP(Cvttss2si, cvttss2si)
249   AVX_OP(Divpd, divpd)
250   AVX_OP(Divps, divps)
251   AVX_OP(Divsd, divsd)
252   AVX_OP(Divss, divss)
253   AVX_OP(Maxpd, maxpd)
254   AVX_OP(Maxps, maxps)
255   AVX_OP(Minpd, minpd)
256   AVX_OP(Minps, minps)
257   AVX_OP(Movaps, movaps)
258   AVX_OP(Movd, movd)
259   AVX_OP(Movhlps, movhlps)
260   AVX_OP(Movhps, movhps)
261   AVX_OP(Movlps, movlps)
262   AVX_OP(Movmskpd, movmskpd)
263   AVX_OP(Movmskps, movmskps)
264   AVX_OP(Movsd, movsd)
265   AVX_OP(Movss, movss)
266   AVX_OP(Movupd, movupd)
267   AVX_OP(Movups, movups)
268   AVX_OP(Mulpd, mulpd)
269   AVX_OP(Mulps, mulps)
270   AVX_OP(Mulsd, mulsd)
271   AVX_OP(Mulss, mulss)
272   AVX_OP(Orpd, orpd)
273   AVX_OP(Orps, orps)
274   AVX_OP(Packssdw, packssdw)
275   AVX_OP(Packsswb, packsswb)
276   AVX_OP(Packuswb, packuswb)
277   AVX_OP(Paddb, paddb)
278   AVX_OP(Paddd, paddd)
279   AVX_OP(Paddq, paddq)
280   AVX_OP(Paddsb, paddsb)
281   AVX_OP(Paddsw, paddsw)
282   AVX_OP(Paddusb, paddusb)
283   AVX_OP(Paddusw, paddusw)
284   AVX_OP(Paddw, paddw)
285   AVX_OP(Pavgb, pavgb)
286   AVX_OP(Pavgw, pavgw)
287   AVX_OP(Pcmpgtb, pcmpgtb)
288   AVX_OP(Pcmpgtd, pcmpgtd)
289   AVX_OP(Pcmpgtw, pcmpgtw)
290   AVX_OP(Pcmpeqb, pcmpeqb)
291   AVX_OP(Pcmpeqd, pcmpeqd)
292   AVX_OP(Pcmpeqw, pcmpeqw)
293   AVX_OP(Pmaddwd, pmaddwd)
294   AVX_OP(Pmaxsw, pmaxsw)
295   AVX_OP(Pmaxub, pmaxub)
296   AVX_OP(Pminsw, pminsw)
297   AVX_OP(Pminub, pminub)
298   AVX_OP(Pmovmskb, pmovmskb)
299   AVX_OP(Pmullw, pmullw)
300   AVX_OP(Pmuludq, pmuludq)
301   AVX_OP(Pshufd, pshufd)
302   AVX_OP(Pshufhw, pshufhw)
303   AVX_OP(Pshuflw, pshuflw)
304   AVX_OP(Pslld, pslld)
305   AVX_OP(Psllq, psllq)
306   AVX_OP(Psllw, psllw)
307   AVX_OP(Psrad, psrad)
308   AVX_OP(Psraw, psraw)
309   AVX_OP(Psrld, psrld)
310   AVX_OP(Psrlq, psrlq)
311   AVX_OP(Psrlw, psrlw)
312   AVX_OP(Psubb, psubb)
313   AVX_OP(Psubd, psubd)
314   AVX_OP(Psubq, psubq)
315   AVX_OP(Psubsb, psubsb)
316   AVX_OP(Psubsw, psubsw)
317   AVX_OP(Psubusb, psubusb)
318   AVX_OP(Psubusw, psubusw)
319   AVX_OP(Psubw, psubw)
320   AVX_OP(Punpckhbw, punpckhbw)
321   AVX_OP(Punpckhdq, punpckhdq)
322   AVX_OP(Punpckhqdq, punpckhqdq)
323   AVX_OP(Punpckhwd, punpckhwd)
324   AVX_OP(Punpcklbw, punpcklbw)
325   AVX_OP(Punpckldq, punpckldq)
326   AVX_OP(Punpcklqdq, punpcklqdq)
327   AVX_OP(Punpcklwd, punpcklwd)
328   AVX_OP(Rcpps, rcpps)
329   AVX_OP(Rsqrtps, rsqrtps)
330   AVX_OP(Sqrtpd, sqrtpd)
331   AVX_OP(Sqrtps, sqrtps)
332   AVX_OP(Sqrtsd, sqrtsd)
333   AVX_OP(Sqrtss, sqrtss)
334   AVX_OP(Subpd, subpd)
335   AVX_OP(Subps, subps)
336   AVX_OP(Subsd, subsd)
337   AVX_OP(Subss, subss)
338   AVX_OP(Ucomisd, ucomisd)
339   AVX_OP(Ucomiss, ucomiss)
340   AVX_OP(Unpcklps, unpcklps)
341   AVX_OP(Xorpd, xorpd)
342   AVX_OP(Xorps, xorps)
343 
344   // Many AVX processors have separate integer/floating-point domains, so use
345   // vmovaps if AVX is supported. On SSE, movaps is 1 byte shorter than movdqa,
346   // and has the same behavior. Most SSE processors also don't have the same
347   // delay moving between integer and floating-point domains.
348   AVX_OP_WITH_DIFF_SSE_INSTR(Movapd, movapd, movaps)
349   AVX_OP_WITH_DIFF_SSE_INSTR(Movdqa, movdqa, movaps)
350   AVX_OP_WITH_DIFF_SSE_INSTR(Movdqu, movdqu, movups)
351   AVX_OP_WITH_DIFF_SSE_INSTR(Pand, pand, andps)
352   AVX_OP_WITH_DIFF_SSE_INSTR(Por, por, orps)
353   AVX_OP_WITH_DIFF_SSE_INSTR(Pxor, pxor, xorps)
354 
355   AVX_OP_SSE3(Haddps, haddps)
356   AVX_OP_SSE3(Movddup, movddup)
357   AVX_OP_SSE3(Movshdup, movshdup)
358 
359   AVX_OP_SSSE3(Pabsb, pabsb)
360   AVX_OP_SSSE3(Pabsd, pabsd)
361   AVX_OP_SSSE3(Pabsw, pabsw)
362   AVX_OP_SSSE3(Palignr, palignr)
363   AVX_OP_SSSE3(Pmulhrsw, pmulhrsw)
364   AVX_OP_SSSE3(Psignb, psignb)
365   AVX_OP_SSSE3(Psignd, psignd)
366   AVX_OP_SSSE3(Psignw, psignw)
367 
368   AVX_OP_SSE4_1(Extractps, extractps)
369   AVX_OP_SSE4_1(Insertps, insertps)
370   AVX_OP_SSE4_1(Packusdw, packusdw)
371   AVX_OP_SSE4_1(Pblendw, pblendw)
372   AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq)
373   AVX_OP_SSE4_1(Pextrb, pextrb)
374   AVX_OP_SSE4_1(Pextrw, pextrw)
375   AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
376   AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
377   AVX_OP_SSE4_1(Pmaxud, pmaxud)
378   AVX_OP_SSE4_1(Pmaxuw, pmaxuw)
379   AVX_OP_SSE4_1(Pminsb, pminsb)
380   AVX_OP_SSE4_1(Pminsd, pminsd)
381   AVX_OP_SSE4_1(Pminud, pminud)
382   AVX_OP_SSE4_1(Pminuw, pminuw)
383   AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw)
384   AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq)
385   AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd)
386   AVX_OP_SSE4_1(Pmovzxbw, pmovzxbw)
387   AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq)
388   AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd)
389   AVX_OP_SSE4_1(Pmulld, pmulld)
390   AVX_OP_SSE4_1(Ptest, ptest)
391   AVX_OP_SSE4_1(Roundpd, roundpd)
392   AVX_OP_SSE4_1(Roundps, roundps)
393   AVX_OP_SSE4_1(Roundsd, roundsd)
394   AVX_OP_SSE4_1(Roundss, roundss)
395 
396 #undef AVX_OP
397 #undef AVX_OP_SSE3
398 #undef AVX_OP_SSSE3
399 #undef AVX_OP_SSE4_1
400 #undef AVX_OP_SSE4_2
401 
402   void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane);
403   void F64x2ReplaceLane(XMMRegister dst, XMMRegister src, DoubleRegister rep,
404                         uint8_t lane);
405   void F64x2Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
406                 XMMRegister scratch);
407   void F64x2Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
408                 XMMRegister scratch);
409   void F32x4Splat(XMMRegister dst, DoubleRegister src);
410   void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
411   void F32x4Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
412                 XMMRegister scratch);
413   void F32x4Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
414                 XMMRegister scratch);
415   void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
416   void I8x16Splat(XMMRegister dst, Register src, XMMRegister scratch);
417   void I8x16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
418   void I8x16Shl(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
419                 XMMRegister tmp2);
420   void I8x16Shl(XMMRegister dst, XMMRegister src1, Register src2, Register tmp1,
421                 XMMRegister tmp2, XMMRegister tmp3);
422   void I8x16ShrS(XMMRegister dst, XMMRegister src1, uint8_t src2,
423                  XMMRegister tmp);
424   void I8x16ShrS(XMMRegister dst, XMMRegister src1, Register src2,
425                  Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
426   void I8x16ShrU(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
427                  XMMRegister tmp2);
428   void I8x16ShrU(XMMRegister dst, XMMRegister src1, Register src2,
429                  Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
430   void I16x8Splat(XMMRegister dst, Register src);
431   void I16x8Splat(XMMRegister dst, Operand src);
432   void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
433                       XMMRegister scratch, bool is_signed);
434   void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
435                         XMMRegister scratch);
436   void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2,
437                         XMMRegister scratch);
438   void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
439   void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
440                               XMMRegister scratch);
441   // Will move src1 to dst if AVX is not supported.
442   void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
443                         XMMRegister scratch);
444   void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
445                                  XMMRegister tmp);
446   // Requires that dst == src1 if AVX is not supported.
447   void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
448                    XMMRegister scratch, bool low, bool is_signed);
449   void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
450   void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src,
451                               XMMRegister scratch);
452   void I64x2Neg(XMMRegister dst, XMMRegister src, XMMRegister scratch);
453   void I64x2Abs(XMMRegister dst, XMMRegister src, XMMRegister scratch);
454   void I64x2GtS(XMMRegister dst, XMMRegister src0, XMMRegister src1,
455                 XMMRegister scratch);
456   void I64x2GeS(XMMRegister dst, XMMRegister src0, XMMRegister src1,
457                 XMMRegister scratch);
458   void I64x2ShrS(XMMRegister dst, XMMRegister src, uint8_t shift,
459                  XMMRegister xmm_tmp);
460   void I64x2ShrS(XMMRegister dst, XMMRegister src, Register shift,
461                  XMMRegister xmm_tmp, XMMRegister xmm_shift,
462                  Register tmp_shift);
463   void I64x2Mul(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
464                 XMMRegister tmp1, XMMRegister tmp2);
465   void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
466                    XMMRegister scratch, bool low, bool is_signed);
467   void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);
468   void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src,
469                               XMMRegister scratch);
470   void S128Not(XMMRegister dst, XMMRegister src, XMMRegister scratch);
471   // Requires dst == mask when AVX is not supported.
472   void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
473                   XMMRegister src2, XMMRegister scratch);
474   void S128Load8Splat(XMMRegister dst, Operand src, XMMRegister scratch);
475   void S128Load16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
476   void S128Load32Splat(XMMRegister dst, Operand src);
477   void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx);
478 
479   void F64x2Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
480                  XMMRegister src3, XMMRegister tmp);
481   void F64x2Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
482                  XMMRegister src3, XMMRegister tmp);
483   void F32x4Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
484                  XMMRegister src3, XMMRegister tmp);
485   void F32x4Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
486                  XMMRegister src3, XMMRegister tmp);
487 
488  protected:
489   template <typename Op>
490   using AvxFn = void (Assembler::*)(XMMRegister, XMMRegister, Op, uint8_t);
491   template <typename Op>
492   using NoAvxFn = void (Assembler::*)(XMMRegister, Op, uint8_t);
493 
494   template <typename Op>
495   void PinsrHelper(Assembler* assm, AvxFn<Op> avx, NoAvxFn<Op> noavx,
496                    XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
497                    uint32_t* load_pc_offset = nullptr,
498                    base::Optional<CpuFeature> feature = base::nullopt) {
499     if (CpuFeatures::IsSupported(AVX)) {
500       CpuFeatureScope scope(assm, AVX);
501       if (load_pc_offset) *load_pc_offset = assm->pc_offset();
502       (assm->*avx)(dst, src1, src2, imm8);
503       return;
504     }
505 
506     if (dst != src1) assm->movaps(dst, src1);
507     if (load_pc_offset) *load_pc_offset = assm->pc_offset();
508     if (feature.has_value()) {
509       DCHECK(CpuFeatures::IsSupported(*feature));
510       CpuFeatureScope scope(assm, *feature);
511       (assm->*noavx)(dst, src2, imm8);
512     } else {
513       (assm->*noavx)(dst, src2, imm8);
514     }
515   }
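  // Pinsrb/Pinsrw above and Pinsrd (in SharedTurboAssemblerBase) funnel
  // through PinsrHelper, passing the CpuFeature required by the non-AVX
  // instruction when there is one (e.g. SSE4_1 for pinsrb/pinsrd).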
516 
517  private:
518   template <typename Op>
519   void I8x16SplatPreAvx2(XMMRegister dst, Op src, XMMRegister scratch);
520   template <typename Op>
521   void I16x8SplatPreAvx2(XMMRegister dst, Op src);
522 };
523 
524 // Common base class template shared by ia32 and x64 TurboAssembler. This uses
525 // the Curiously Recurring Template Pattern (CRTP), where Impl is the actual
526 // class (subclass of SharedTurboAssemblerBase instantiated with the actual
527 // class). This allows static polymorphism, where member functions can be moved
528 // into SharedTurboAssembler, and we can also call into member functions
529 // defined in the ia32- or x64-specific TurboAssembler from within this template
530 // class, via Impl.
531 //
532 // Note: all member functions must be defined in this header file so that the
533 // compiler can generate code for the function definitions. See
534 // https://isocpp.org/wiki/faq/templates#templates-defn-vs-decl for rationale.
535 // If a function does not need polymorphism, move it into SharedTurboAssembler,
536 // and define it outside of this header.
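// For example, code in this template calls impl()->PinsrdPreSse41(...) and
// impl()->ExternalReferenceAsOperand(...), which resolve at compile time to
// the ia32 or x64 TurboAssembler that instantiates it.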
537 template <typename Impl>
538 class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
539   using SharedTurboAssembler::SharedTurboAssembler;
540 
541  public:
542   void Abspd(XMMRegister dst, XMMRegister src, Register tmp) {
543     FloatUnop(dst, src, tmp, &SharedTurboAssembler::Andps,
544               ExternalReference::address_of_double_abs_constant());
545   }
546 
547   void Absps(XMMRegister dst, XMMRegister src, Register tmp) {
548     FloatUnop(dst, src, tmp, &SharedTurboAssembler::Andps,
549               ExternalReference::address_of_float_abs_constant());
550   }
551 
552   void Negpd(XMMRegister dst, XMMRegister src, Register tmp) {
553     FloatUnop(dst, src, tmp, &SharedTurboAssembler::Xorps,
554               ExternalReference::address_of_double_neg_constant());
555   }
556 
557   void Negps(XMMRegister dst, XMMRegister src, Register tmp) {
558     FloatUnop(dst, src, tmp, &SharedTurboAssembler::Xorps,
559               ExternalReference::address_of_float_neg_constant());
560   }
562 
563   void Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
564     if (imm8 == 0) {
565       Movd(dst, src);
566       return;
567     }
568 
569     if (CpuFeatures::IsSupported(AVX)) {
570       CpuFeatureScope scope(this, AVX);
571       vpextrd(dst, src, imm8);
572     } else if (CpuFeatures::IsSupported(SSE4_1)) {
573       CpuFeatureScope sse_scope(this, SSE4_1);
574       pextrd(dst, src, imm8);
575     } else {
576       DCHECK_LT(imm8, 2);
577       impl()->PextrdPreSse41(dst, src, imm8);
578     }
579   }
580 
581   template <typename Op>
582   void Pinsrd(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
583               uint32_t* load_pc_offset = nullptr) {
584     if (CpuFeatures::IsSupported(SSE4_1)) {
585       PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1,
586                   src2, imm8, load_pc_offset,
587                   base::Optional<CpuFeature>(SSE4_1));
588     } else {
589       if (dst != src1) {
590         movaps(dst, src1);
591       }
592       impl()->PinsrdPreSse41(dst, src2, imm8, load_pc_offset);
593     }
594   }
595 
596   template <typename Op>
597   void Pinsrd(XMMRegister dst, Op src, uint8_t imm8,
598               uint32_t* load_pc_offset = nullptr) {
599     Pinsrd(dst, dst, src, imm8, load_pc_offset);
600   }
601 
602   void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src,
603                              Register scratch) {
604     ASM_CODE_COMMENT(this);
605     // dst = [ src_low, 0x43300000, src_high, 0x43300000 ];
606     // 0x43300000'00000000 is a special double where the significand bits
607     // precisely represent all uint32 numbers.
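    // (0x43300000'00000000 is the bit pattern of 2^52; subtracting 2^52 below
    // therefore leaves exactly the original uint32 of each lane as a double.)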
608     if (!CpuFeatures::IsSupported(AVX) && dst != src) {
609       movaps(dst, src);
610       src = dst;
611     }
612     Unpcklps(dst, src,
613              ExternalReferenceAsOperand(
614                  ExternalReference::
615                      address_of_wasm_f64x2_convert_low_i32x4_u_int_mask(),
616                  scratch));
617     Subpd(dst,
618           ExternalReferenceAsOperand(
619               ExternalReference::address_of_wasm_double_2_power_52(), scratch));
620   }
621 
622   void I32x4SConvertF32x4(XMMRegister dst, XMMRegister src, XMMRegister tmp,
623                           Register scratch) {
624     ASM_CODE_COMMENT(this);
625     Operand op = ExternalReferenceAsOperand(
626         ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch);
627 
628     // This algorithm works by:
629     // 1. lanes with NaNs are zeroed
630     // 2. lanes >= 2147483648.0f (MAX_INT32+1) are set to 0xffff'ffff
631     // 3. cvttps2dq sets all out of range lanes to 0x8000'0000
632     //   a. correct for underflows (< MIN_INT32)
633     //   b. wrong for overflow, and we know which lanes overflow from 2.
634     // 4. adjust for 3b by xor-ing 2 and 3
635     //   a. 0x8000'0000 xor 0xffff'ffff = 0x7fff'ffff (MAX_INT32)
636     if (CpuFeatures::IsSupported(AVX)) {
637       CpuFeatureScope scope(this, AVX);
638       vcmpeqps(tmp, src, src);
639       vandps(dst, src, tmp);
640       vcmpgeps(tmp, src, op);
641       vcvttps2dq(dst, dst);
642       vpxor(dst, dst, tmp);
643     } else {
644       if (src == dst) {
645         movaps(tmp, src);
646         cmpeqps(tmp, tmp);
647         andps(dst, tmp);
648         movaps(tmp, op);
649         cmpleps(tmp, dst);
650         cvttps2dq(dst, dst);
651         xorps(dst, tmp);
652       } else {
653         movaps(tmp, op);
654         cmpleps(tmp, src);
655         cvttps2dq(dst, src);
656         xorps(dst, tmp);
657         movaps(tmp, src);
658         cmpeqps(tmp, tmp);
659         andps(dst, tmp);
660       }
661     }
662   }
663 
664   void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src,
665                                XMMRegister scratch, Register tmp) {
666     ASM_CODE_COMMENT(this);
667     if (CpuFeatures::IsSupported(AVX)) {
668       CpuFeatureScope avx_scope(this, AVX);
669       XMMRegister original_dst = dst;
670       // Make sure we don't overwrite src.
671       if (dst == src) {
672         DCHECK_NE(src, scratch);
673         dst = scratch;
674       }
675       // dst = 0 if src == NaN, else all ones.
676       vcmpeqpd(dst, src, src);
677       // dst = 0 if src == NaN, else INT32_MAX as double.
678       vandpd(
679           dst, dst,
680           ExternalReferenceAsOperand(
681               ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
682       // dst = 0 if src == NaN, src is saturated to INT32_MAX as double.
683       vminpd(dst, src, dst);
684       // Values > INT32_MAX are already saturated, values < INT32_MIN raise an
685       // exception, which is masked and returns 0x80000000.
686       vcvttpd2dq(original_dst, dst);
687     } else {
688       if (dst != src) {
689         movaps(dst, src);
690       }
691       movaps(scratch, dst);
692       cmpeqpd(scratch, dst);
693       andps(scratch,
694             ExternalReferenceAsOperand(
695                 ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
696       minpd(dst, scratch);
697       cvttpd2dq(dst, dst);
698     }
699   }
700 
701   void I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src,
702                                XMMRegister scratch, Register tmp) {
703     ASM_CODE_COMMENT(this);
704     if (CpuFeatures::IsSupported(AVX)) {
705       CpuFeatureScope avx_scope(this, AVX);
706       vxorpd(scratch, scratch, scratch);
707       // Saturate to 0.
708       vmaxpd(dst, src, scratch);
709       // Saturate to UINT32_MAX.
710       vminpd(
711           dst, dst,
712           ExternalReferenceAsOperand(
713               ExternalReference::address_of_wasm_uint32_max_as_double(), tmp));
714       // Truncate.
715       vroundpd(dst, dst, kRoundToZero);
716       // Add to special double where significand bits == uint32.
717       vaddpd(dst, dst,
718              ExternalReferenceAsOperand(
719                  ExternalReference::address_of_wasm_double_2_power_52(), tmp));
720       // Extract low 32 bits of each double's significand, zero top lanes.
721       // dst = [dst[0], dst[2], 0, 0]
722       vshufps(dst, dst, scratch, 0x88);
723     } else {
724       CpuFeatureScope scope(this, SSE4_1);
725       if (dst != src) {
726         movaps(dst, src);
727       }
728       xorps(scratch, scratch);
729       maxpd(dst, scratch);
730       minpd(dst, ExternalReferenceAsOperand(
731                      ExternalReference::address_of_wasm_uint32_max_as_double(),
732                      tmp));
733       roundpd(dst, dst, kRoundToZero);
734       addpd(dst,
735             ExternalReferenceAsOperand(
736                 ExternalReference::address_of_wasm_double_2_power_52(), tmp));
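      // As in the AVX path above: keep lanes 0 and 2 (the low 32 bits of each
      // double's significand) and zero the two upper lanes.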
737       shufps(dst, scratch, 0x88);
738     }
739   }
740 
741   void I32x4TruncF64x2UZero(XMMRegister dst, XMMRegister src, Register tmp,
742                             XMMRegister scratch) {
743     // TODO(zhin): call this from I32x4TruncSatF64x2UZero.
744     ASM_CODE_COMMENT(this);
745     if (dst != src && !CpuFeatures::IsSupported(AVX)) {
746       movaps(dst, src);
747       src = dst;
748     }
749     // Same as I32x4TruncSatF64x2UZero but without the saturation.
750     Roundpd(dst, src, kRoundToZero);
751     // Add to special double where significand bits == uint32.
752     Addpd(dst, dst,
753           ExternalReferenceAsOperand(
754               ExternalReference::address_of_wasm_double_2_power_52(), tmp));
755     // Extract low 32 bits of each double's significand, zero top lanes.
756     // dst = [dst[0], dst[2], 0, 0]
757     Shufps(dst, dst, scratch, 0x88);
758   }
759 
760   void I32x4TruncF32x4U(XMMRegister dst, XMMRegister src, Register scratch,
761                         XMMRegister tmp) {
762     ASM_CODE_COMMENT(this);
763     Operand int32_overflow_op = ExternalReferenceAsOperand(
764         ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch);
765     if (CpuFeatures::IsSupported(AVX)) {
766       CpuFeatureScope avx_scope(this, AVX);
767       vcmpltps(tmp, src, int32_overflow_op);
768     } else {
769       movaps(tmp, src);
770       cmpltps(tmp, int32_overflow_op);
771     }
772     // In tmp, lanes < INT32_MAX are left alone, other lanes are zeroed.
773     Pand(tmp, src);
774     // tmp = src with all the valid conversions
775     if (dst != src) {
776       Movaps(dst, src);
777     }
778     // In dst, lanes < INT32_MAX are zeroed, other lanes left alone.
779     Pxor(dst, tmp);
780     // tmp contains only lanes which can be converted correctly (<INT32_MAX)
781     Cvttps2dq(tmp, tmp);
782     // Bit-trick follows:
783     // All integers from INT32_MAX to UINT32_MAX that are representable as
784     // floats lie in [0x4f00'0000, 0x4f80'0000).
785     // The bit representation of the integers is actually shifted right by 8.
786     // For example given 2147483904.0f (which fits in UINT32_MAX):
787     //
788     // 01001111 00000000 00000000 00000001 (float 0x4f00'0001)
789     //          ^^^^^^^^^^^^^^^^^^^^^^^^^^
790     //          these are exactly the top 24 bits of the int representation,
791     //          but the top bit needs to be flipped
792     // 10000000 00000000 00000001 00000000 (int 0x8000'0100)
793     //
794     // So what needs to be done is to flip bit 23, which is the lowest bit of
795     // the exponent, which means multiplying by 2 (or addps to itself).
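    // Continuing the example: after the addps the low 24 bits of the float are
    // 10000000 00000000 00000001, and pslld by 8 then yields 0x8000'0100.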
796     Addps(dst, dst, dst);
797     // Then shift to get the bit representation of the int.
798     Pslld(dst, byte{8});
799     // Merge the converted lanes and bit shifted lanes.
800     Paddd(dst, tmp);
801   }
802 
803   void I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src,
804                                  Register scratch) {
805     ASM_CODE_COMMENT(this);
806     Operand op = ExternalReferenceAsOperand(
807         ExternalReference::address_of_wasm_i16x8_splat_0x0001(), scratch);
808     // pmaddwd multiplies signed words in src and op, producing
809     // signed doublewords, then adds pairwise.
810     // src = |a|b|c|d|e|f|g|h|
811     // dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
812     if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
813       movaps(dst, src);
814       src = dst;
815     }
816 
817     Pmaddwd(dst, src, op);
818   }
819 
820   void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
821                                  XMMRegister scratch, Register tmp) {
822     ASM_CODE_COMMENT(this);
823     // pmaddubsw treats the first operand as unsigned, so pass the external
824     // reference to it as the first operand.
825     Operand op = ExternalReferenceAsOperand(
826         ExternalReference::address_of_wasm_i8x16_splat_0x01(), tmp);
827     if (CpuFeatures::IsSupported(AVX)) {
828       CpuFeatureScope avx_scope(this, AVX);
829       vmovdqa(scratch, op);
830       vpmaddubsw(dst, scratch, src);
831     } else {
832       CpuFeatureScope sse_scope(this, SSSE3);
833       if (dst == src) {
834         movaps(scratch, op);
835         pmaddubsw(scratch, src);
836         movaps(dst, scratch);
837       } else {
838         movaps(dst, op);
839         pmaddubsw(dst, src);
840       }
841     }
842   }
843 
844   void I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src,
845                                  Register scratch) {
846     ASM_CODE_COMMENT(this);
847     Operand op = ExternalReferenceAsOperand(
848         ExternalReference::address_of_wasm_i8x16_splat_0x01(), scratch);
849     if (CpuFeatures::IsSupported(AVX)) {
850       CpuFeatureScope avx_scope(this, AVX);
851       vpmaddubsw(dst, src, op);
852     } else {
853       CpuFeatureScope sse_scope(this, SSSE3);
854       if (dst != src) {
855         movaps(dst, src);
856       }
857       pmaddubsw(dst, op);
858     }
859   }
860 
861   void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
862                     XMMRegister scratch, Register tmp, bool omit_add = false) {
863     ASM_CODE_COMMENT(this);
864     if (omit_add) {
865       // We have determined that the indices are immediates, and they are either
866       // within bounds, or the top bit is set, so we can omit the add.
867       Pshufb(dst, src, mask);
868       return;
869     }
870 
871     // Out-of-range indices should return 0. Add 112 so that any index > 15
872     // ends up with the top bit set (>= 128), so pshufb will zero that lane.
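    // E.g. index 7 + 112 = 119 keeps the top bit clear (pshufb selects byte 7),
    // while any index >= 16 ends up >= 128 and that lane is zeroed.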
873     Operand op = ExternalReferenceAsOperand(
874         ExternalReference::address_of_wasm_i8x16_swizzle_mask(), tmp);
875     if (CpuFeatures::IsSupported(AVX)) {
876       CpuFeatureScope avx_scope(this, AVX);
877       vpaddusb(scratch, mask, op);
878       vpshufb(dst, src, scratch);
879     } else {
880       CpuFeatureScope sse_scope(this, SSSE3);
881       movaps(scratch, op);
882       if (dst != src) {
883         DCHECK_NE(dst, mask);
884         movaps(dst, src);
885       }
886       paddusb(scratch, mask);
887       pshufb(dst, scratch);
888     }
889   }
890 
891   void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
892                    XMMRegister tmp2, Register scratch) {
893     ASM_CODE_COMMENT(this);
894     DCHECK_NE(dst, tmp1);
895     DCHECK_NE(src, tmp1);
896     DCHECK_NE(dst, tmp2);
897     DCHECK_NE(src, tmp2);
898     if (CpuFeatures::IsSupported(AVX)) {
899       CpuFeatureScope avx_scope(this, AVX);
900       vmovdqa(tmp1, ExternalReferenceAsOperand(
901                         ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
902                         scratch));
903       vpandn(tmp2, tmp1, src);
904       vpand(dst, tmp1, src);
905       vmovdqa(tmp1, ExternalReferenceAsOperand(
906                         ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
907                         scratch));
908       vpsrlw(tmp2, tmp2, 4);
909       vpshufb(dst, tmp1, dst);
910       vpshufb(tmp2, tmp1, tmp2);
911       vpaddb(dst, dst, tmp2);
912     } else if (CpuFeatures::IsSupported(INTEL_ATOM)) {
913       // Pre-Goldmont low-power Intel microarchitectures have a very slow
914       // PSHUFB instruction, so use a PSHUFB-free divide-and-conquer
915       // algorithm on these processors. The ATOM CPU feature captures exactly
916       // the right set of processors.
917       movaps(tmp1, src);
918       psrlw(tmp1, 1);
919       if (dst != src) {
920         movaps(dst, src);
921       }
922       andps(tmp1, ExternalReferenceAsOperand(
923                       ExternalReference::address_of_wasm_i8x16_splat_0x55(),
924                       scratch));
925       psubb(dst, tmp1);
926       Operand splat_0x33 = ExternalReferenceAsOperand(
927           ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
928       movaps(tmp1, dst);
929       andps(dst, splat_0x33);
930       psrlw(tmp1, 2);
931       andps(tmp1, splat_0x33);
932       paddb(dst, tmp1);
933       movaps(tmp1, dst);
934       psrlw(dst, 4);
935       paddb(dst, tmp1);
936       andps(dst, ExternalReferenceAsOperand(
937                      ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
938                      scratch));
939     } else {
940       CpuFeatureScope sse_scope(this, SSSE3);
941       movaps(tmp1, ExternalReferenceAsOperand(
942                        ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
943                        scratch));
944       Operand mask = ExternalReferenceAsOperand(
945           ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
946       if (tmp2 != tmp1) {
947         movaps(tmp2, tmp1);
948       }
949       andps(tmp1, src);
950       andnps(tmp2, src);
951       psrlw(tmp2, 4);
952       movaps(dst, mask);
953       pshufb(dst, tmp1);
954       movaps(tmp1, mask);
955       pshufb(tmp1, tmp2);
956       paddb(dst, tmp1);
957     }
958   }
959 
960  private:
961   // All implementation-specific methods must be called through this.
962   Impl* impl() { return static_cast<Impl*>(this); }
963 
964   Operand ExternalReferenceAsOperand(ExternalReference reference,
965                                      Register scratch) {
966     return impl()->ExternalReferenceAsOperand(reference, scratch);
967   }
968 
969   using FloatInstruction = void (SharedTurboAssembler::*)(XMMRegister,
970                                                           XMMRegister, Operand);
971   void FloatUnop(XMMRegister dst, XMMRegister src, Register tmp,
972                  FloatInstruction op, ExternalReference ext) {
973     if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
974       movaps(dst, src);
975       src = dst;
976     }
977     SharedTurboAssembler* assm = this;
978     (assm->*op)(dst, src, ExternalReferenceAsOperand(ext, tmp));
979   }
980 };
981 
982 }  // namespace internal
983 }  // namespace v8
984 #endif  // V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_
985