// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_
#define V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_

#include "src/base/macros.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/external-reference.h"
#include "src/codegen/turbo-assembler.h"

#if V8_TARGET_ARCH_IA32
#include "src/codegen/ia32/register-ia32.h"
#elif V8_TARGET_ARCH_X64
#include "src/codegen/x64/register-x64.h"
#else
#error Unsupported target architecture.
#endif

namespace v8 {
namespace internal {
class Assembler;

// For WebAssembly we care about the full floating point register. If we are
// not running Wasm, we can get away with saving half of those registers.
#if V8_ENABLE_WEBASSEMBLY
constexpr int kStackSavedSavedFPSize = 2 * kDoubleSize;
#else
constexpr int kStackSavedSavedFPSize = kDoubleSize;
#endif  // V8_ENABLE_WEBASSEMBLY
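// In other words, 16 bytes (the full 128-bit XMM register) are saved per
// register when Wasm is enabled, and only the low 8 bytes (one double)
// otherwise.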

// Base class for SharedTurboAssemblerBase. This class contains macro-assembler
// functions that can be shared across ia32 and x64 without any template
// machinery, i.e. they do not require the CRTP pattern that
// SharedTurboAssemblerBase exposes. This allows us to keep the bulk of the
// definitions inside a separate source file, rather than putting everything
// inside this header.
class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
 public:
  using TurboAssemblerBase::TurboAssemblerBase;

  void Move(Register dst, uint32_t src);
  // Move if registers are not identical.
  void Move(Register dst, Register src);
  void Add(Register dst, Immediate src);
  void And(Register dst, Immediate src);

  // Will move src1 to dst if AVX is not supported.
  void Movhps(XMMRegister dst, XMMRegister src1, Operand src2);
  void Movlps(XMMRegister dst, XMMRegister src1, Operand src2);

  void Pblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                XMMRegister mask);

  template <typename Op>
  void Pinsrb(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    PinsrHelper(this, &Assembler::vpinsrb, &Assembler::pinsrb, dst, src1, src2,
                imm8, load_pc_offset, {SSE4_1});
  }
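  // For illustration: Pinsrb(dst, src1, op, 3) emits vpinsrb dst, src1, op, 3
  // when AVX is available; otherwise it copies src1 into dst (if needed) and
  // emits pinsrb dst, op, 3 under an SSE4_1 scope (see PinsrHelper below).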

  template <typename Op>
  void Pinsrw(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    PinsrHelper(this, &Assembler::vpinsrw, &Assembler::pinsrw, dst, src1, src2,
                imm8, load_pc_offset);
  }

  // Supports both SSE and AVX. Moves src to dst if they are not equal on SSE.
  template <typename Op>
  void Pshufb(XMMRegister dst, XMMRegister src, Op mask) {
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpshufb(dst, src, mask);
    } else {
      // Make sure these are different so that we won't overwrite mask.
      DCHECK_NE(mask, dst);
      if (dst != src) {
        movaps(dst, src);
      }
      CpuFeatureScope sse_scope(this, SSSE3);
      pshufb(dst, mask);
    }
  }

  template <typename Op>
  void Pshufb(XMMRegister dst, Op mask) {
    Pshufb(dst, dst, mask);
  }
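  // For illustration: Pshufb(xmm0, xmm1, xmm2) becomes vpshufb xmm0, xmm1,
  // xmm2 under AVX, and movaps xmm0, xmm1 (if needed) followed by
  // pshufb xmm0, xmm2 under SSSE3.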

  // Shufps that will move src1 into dst if AVX is not supported.
  void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
              uint8_t imm8);

  // Helper struct to implement functions that check for AVX support and
  // dispatch to the appropriate AVX/SSE instruction.
  template <typename Dst, typename Arg, typename... Args>
  struct AvxHelper {
    Assembler* assm;
    base::Optional<CpuFeature> feature = base::nullopt;
    // Call a method where the AVX version expects the dst argument to be
    // duplicated.
    // E.g. Andps(x, y) -> vandps(x, x, y)
    //                  -> andps(x, y)
    template <void (Assembler::*avx)(Dst, Dst, Arg, Args...),
              void (Assembler::*no_avx)(Dst, Arg, Args...)>
    void emit(Dst dst, Arg arg, Args... args) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope scope(assm, AVX);
        (assm->*avx)(dst, dst, arg, args...);
      } else if (feature.has_value()) {
        DCHECK(CpuFeatures::IsSupported(*feature));
        CpuFeatureScope scope(assm, *feature);
        (assm->*no_avx)(dst, arg, args...);
      } else {
        (assm->*no_avx)(dst, arg, args...);
      }
    }

    // Call a method in the AVX form (with one extra operand); if AVX is not
    // supported, check that dst == first src and call the SSE form.
    // E.g. Andps(x, y, z) -> vandps(x, y, z)
    //                     -> andps(x, z) and check that x == y
    template <void (Assembler::*avx)(Dst, Arg, Args...),
              void (Assembler::*no_avx)(Dst, Args...)>
    void emit(Dst dst, Arg arg, Args... args) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope scope(assm, AVX);
        (assm->*avx)(dst, arg, args...);
      } else if (feature.has_value()) {
        DCHECK_EQ(dst, arg);
        DCHECK(CpuFeatures::IsSupported(*feature));
        CpuFeatureScope scope(assm, *feature);
        (assm->*no_avx)(dst, args...);
      } else {
        DCHECK_EQ(dst, arg);
        (assm->*no_avx)(dst, args...);
      }
    }

    // Call a method where the AVX version expects no duplicated dst argument.
    // E.g. Movddup(x, y) -> vmovddup(x, y)
    //                    -> movddup(x, y)
    template <void (Assembler::*avx)(Dst, Arg, Args...),
              void (Assembler::*no_avx)(Dst, Arg, Args...)>
    void emit(Dst dst, Arg arg, Args... args) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope scope(assm, AVX);
        (assm->*avx)(dst, arg, args...);
      } else if (feature.has_value()) {
        DCHECK(CpuFeatures::IsSupported(*feature));
        CpuFeatureScope scope(assm, *feature);
        (assm->*no_avx)(dst, arg, args...);
      } else {
        (assm->*no_avx)(dst, arg, args...);
      }
    }
  };

#define AVX_OP(macro_name, name)                                        \
  template <typename Dst, typename Arg, typename... Args>               \
  void macro_name(Dst dst, Arg arg, Args... args) {                     \
    AvxHelper<Dst, Arg, Args...>{this}                                  \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \
                                                              args...); \
  }
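// For illustration, AVX_OP(Addps, addps) defines a member template Addps that
// forwards to AvxHelper::emit<&Assembler::vaddps, &Assembler::addps>, so
// Addps(xmm0, xmm1) assembles vaddps xmm0, xmm0, xmm1 when AVX is available
// and addps xmm0, xmm1 otherwise.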

// Define a macro which uses |avx_name| when AVX is supported, and |sse_name|
// when AVX is not supported. This is useful for bit-wise instructions like
// andpd/andps, where the behavior is exactly the same, but the *ps
// version is 1 byte shorter, and on SSE-only processors there is no
// performance difference since those processors don't differentiate integer
// and floating-point domains.
// Note: we require |avx_name| to be the AVX instruction without the "v"
// prefix. If we required the full AVX instruction name and the caller
// accidentally passed in an SSE instruction, it would compile without any
// issues and silently generate the SSE instruction. By appending "v" here, we
// ensure that we will generate an AVX instruction.
#define AVX_OP_WITH_DIFF_SSE_INSTR(macro_name, avx_name, sse_name)     \
  template <typename Dst, typename Arg, typename... Args>              \
  void macro_name(Dst dst, Arg arg, Args... args) {                    \
    AvxHelper<Dst, Arg, Args...>{this}                                 \
        .template emit<&Assembler::v##avx_name, &Assembler::sse_name>( \
            dst, arg, args...);                                        \
  }
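// For illustration, AVX_OP_WITH_DIFF_SSE_INSTR(Pxor, pxor, xorps) defines
// Pxor so that Pxor(xmm0, xmm1) assembles vpxor xmm0, xmm0, xmm1 under AVX
// and xorps xmm0, xmm1 otherwise.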

#define AVX_OP_SSE3(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                \
  void macro_name(Dst dst, Arg arg, Args... args) {                      \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE3)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,  \
                                                              args...);  \
  }

#define AVX_OP_SSSE3(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                 \
  void macro_name(Dst dst, Arg arg, Args... args) {                       \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSSE3)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,   \
                                                              args...);   \
  }

#define AVX_OP_SSE4_1(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                  \
  void macro_name(Dst dst, Arg arg, Args... args) {                        \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_1)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,    \
                                                              args...);    \
  }

#define AVX_OP_SSE4_2(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                  \
  void macro_name(Dst dst, Arg arg, Args... args) {                        \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_2)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,    \
                                                              args...);    \
  }

  // Keep this list sorted by required extension, then instruction name.
  AVX_OP(Addpd, addpd)
  AVX_OP(Addps, addps)
  AVX_OP(Addsd, addsd)
  AVX_OP(Addss, addss)
  AVX_OP(Andnpd, andnpd)
  AVX_OP(Andnps, andnps)
  AVX_OP(Andpd, andpd)
  AVX_OP(Andps, andps)
  AVX_OP(Cmpeqpd, cmpeqpd)
  AVX_OP(Cmpeqps, cmpeqps)
  AVX_OP(Cmplepd, cmplepd)
  AVX_OP(Cmpleps, cmpleps)
  AVX_OP(Cmpltpd, cmpltpd)
  AVX_OP(Cmpltps, cmpltps)
  AVX_OP(Cmpneqpd, cmpneqpd)
  AVX_OP(Cmpneqps, cmpneqps)
  AVX_OP(Cmpunordpd, cmpunordpd)
  AVX_OP(Cmpunordps, cmpunordps)
  AVX_OP(Cvtdq2pd, cvtdq2pd)
  AVX_OP(Cvtdq2ps, cvtdq2ps)
  AVX_OP(Cvtpd2ps, cvtpd2ps)
  AVX_OP(Cvtps2pd, cvtps2pd)
  AVX_OP(Cvtsd2ss, cvtsd2ss)
  AVX_OP(Cvtss2sd, cvtss2sd)
  AVX_OP(Cvttpd2dq, cvttpd2dq)
  AVX_OP(Cvttps2dq, cvttps2dq)
  AVX_OP(Cvttsd2si, cvttsd2si)
  AVX_OP(Cvttss2si, cvttss2si)
  AVX_OP(Divpd, divpd)
  AVX_OP(Divps, divps)
  AVX_OP(Divsd, divsd)
  AVX_OP(Divss, divss)
  AVX_OP(Maxpd, maxpd)
  AVX_OP(Maxps, maxps)
  AVX_OP(Minpd, minpd)
  AVX_OP(Minps, minps)
  AVX_OP(Movaps, movaps)
  AVX_OP(Movd, movd)
  AVX_OP(Movhlps, movhlps)
  AVX_OP(Movhps, movhps)
  AVX_OP(Movlps, movlps)
  AVX_OP(Movmskpd, movmskpd)
  AVX_OP(Movmskps, movmskps)
  AVX_OP(Movsd, movsd)
  AVX_OP(Movss, movss)
  AVX_OP(Movupd, movupd)
  AVX_OP(Movups, movups)
  AVX_OP(Mulpd, mulpd)
  AVX_OP(Mulps, mulps)
  AVX_OP(Mulsd, mulsd)
  AVX_OP(Mulss, mulss)
  AVX_OP(Orpd, orpd)
  AVX_OP(Orps, orps)
  AVX_OP(Packssdw, packssdw)
  AVX_OP(Packsswb, packsswb)
  AVX_OP(Packuswb, packuswb)
  AVX_OP(Paddb, paddb)
  AVX_OP(Paddd, paddd)
  AVX_OP(Paddq, paddq)
  AVX_OP(Paddsb, paddsb)
  AVX_OP(Paddsw, paddsw)
  AVX_OP(Paddusb, paddusb)
  AVX_OP(Paddusw, paddusw)
  AVX_OP(Paddw, paddw)
  AVX_OP(Pavgb, pavgb)
  AVX_OP(Pavgw, pavgw)
  AVX_OP(Pcmpeqb, pcmpeqb)
  AVX_OP(Pcmpeqd, pcmpeqd)
  AVX_OP(Pcmpeqw, pcmpeqw)
  AVX_OP(Pcmpgtb, pcmpgtb)
  AVX_OP(Pcmpgtd, pcmpgtd)
  AVX_OP(Pcmpgtw, pcmpgtw)
  AVX_OP(Pmaddwd, pmaddwd)
  AVX_OP(Pmaxsw, pmaxsw)
  AVX_OP(Pmaxub, pmaxub)
  AVX_OP(Pminsw, pminsw)
  AVX_OP(Pminub, pminub)
  AVX_OP(Pmovmskb, pmovmskb)
  AVX_OP(Pmullw, pmullw)
  AVX_OP(Pmuludq, pmuludq)
  AVX_OP(Pshufd, pshufd)
  AVX_OP(Pshufhw, pshufhw)
  AVX_OP(Pshuflw, pshuflw)
  AVX_OP(Pslld, pslld)
  AVX_OP(Psllq, psllq)
  AVX_OP(Psllw, psllw)
  AVX_OP(Psrad, psrad)
  AVX_OP(Psraw, psraw)
  AVX_OP(Psrld, psrld)
  AVX_OP(Psrlq, psrlq)
  AVX_OP(Psrlw, psrlw)
  AVX_OP(Psubb, psubb)
  AVX_OP(Psubd, psubd)
  AVX_OP(Psubq, psubq)
  AVX_OP(Psubsb, psubsb)
  AVX_OP(Psubsw, psubsw)
  AVX_OP(Psubusb, psubusb)
  AVX_OP(Psubusw, psubusw)
  AVX_OP(Psubw, psubw)
  AVX_OP(Punpckhbw, punpckhbw)
  AVX_OP(Punpckhdq, punpckhdq)
  AVX_OP(Punpckhqdq, punpckhqdq)
  AVX_OP(Punpckhwd, punpckhwd)
  AVX_OP(Punpcklbw, punpcklbw)
  AVX_OP(Punpckldq, punpckldq)
  AVX_OP(Punpcklqdq, punpcklqdq)
  AVX_OP(Punpcklwd, punpcklwd)
  AVX_OP(Rcpps, rcpps)
  AVX_OP(Rsqrtps, rsqrtps)
  AVX_OP(Sqrtpd, sqrtpd)
  AVX_OP(Sqrtps, sqrtps)
  AVX_OP(Sqrtsd, sqrtsd)
  AVX_OP(Sqrtss, sqrtss)
  AVX_OP(Subpd, subpd)
  AVX_OP(Subps, subps)
  AVX_OP(Subsd, subsd)
  AVX_OP(Subss, subss)
  AVX_OP(Ucomisd, ucomisd)
  AVX_OP(Ucomiss, ucomiss)
  AVX_OP(Unpcklps, unpcklps)
  AVX_OP(Xorpd, xorpd)
  AVX_OP(Xorps, xorps)

  // Many AVX processors have separate integer/floating-point domains, so use
  // the domain-matching instruction (e.g. vmovdqa, vpand) when AVX is
  // supported. On SSE, movaps is 1 byte shorter than movdqa and has the same
  // behavior; most SSE processors also don't have a delay when moving between
  // the integer and floating-point domains.
  AVX_OP_WITH_DIFF_SSE_INSTR(Movapd, movapd, movaps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Movdqa, movdqa, movaps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Movdqu, movdqu, movups)
  AVX_OP_WITH_DIFF_SSE_INSTR(Pand, pand, andps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Por, por, orps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Pxor, pxor, xorps)

  AVX_OP_SSE3(Haddps, haddps)
  AVX_OP_SSE3(Movddup, movddup)
  AVX_OP_SSE3(Movshdup, movshdup)

  AVX_OP_SSSE3(Pabsb, pabsb)
  AVX_OP_SSSE3(Pabsd, pabsd)
  AVX_OP_SSSE3(Pabsw, pabsw)
  AVX_OP_SSSE3(Palignr, palignr)
  AVX_OP_SSSE3(Pmulhrsw, pmulhrsw)
  AVX_OP_SSSE3(Psignb, psignb)
  AVX_OP_SSSE3(Psignd, psignd)
  AVX_OP_SSSE3(Psignw, psignw)

  AVX_OP_SSE4_1(Extractps, extractps)
  AVX_OP_SSE4_1(Insertps, insertps)
  AVX_OP_SSE4_1(Packusdw, packusdw)
  AVX_OP_SSE4_1(Pblendw, pblendw)
  AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq)
  AVX_OP_SSE4_1(Pextrb, pextrb)
  AVX_OP_SSE4_1(Pextrw, pextrw)
  AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
  AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
  AVX_OP_SSE4_1(Pmaxud, pmaxud)
  AVX_OP_SSE4_1(Pmaxuw, pmaxuw)
  AVX_OP_SSE4_1(Pminsb, pminsb)
  AVX_OP_SSE4_1(Pminsd, pminsd)
  AVX_OP_SSE4_1(Pminud, pminud)
  AVX_OP_SSE4_1(Pminuw, pminuw)
  AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw)
  AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq)
  AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd)
  AVX_OP_SSE4_1(Pmovzxbw, pmovzxbw)
  AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq)
  AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd)
  AVX_OP_SSE4_1(Pmulld, pmulld)
  AVX_OP_SSE4_1(Ptest, ptest)
  AVX_OP_SSE4_1(Roundpd, roundpd)
  AVX_OP_SSE4_1(Roundps, roundps)
  AVX_OP_SSE4_1(Roundsd, roundsd)
  AVX_OP_SSE4_1(Roundss, roundss)

#undef AVX_OP
#undef AVX_OP_WITH_DIFF_SSE_INSTR
#undef AVX_OP_SSE3
#undef AVX_OP_SSSE3
#undef AVX_OP_SSE4_1
#undef AVX_OP_SSE4_2

  void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane);
  void F64x2ReplaceLane(XMMRegister dst, XMMRegister src, DoubleRegister rep,
                        uint8_t lane);
  void F64x2Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void F64x2Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void F32x4Splat(XMMRegister dst, DoubleRegister src);
  void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
  void F32x4Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void F32x4Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
  void I8x16Splat(XMMRegister dst, Register src, XMMRegister scratch);
  void I8x16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
  void I8x16Shl(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
                XMMRegister tmp2);
  void I8x16Shl(XMMRegister dst, XMMRegister src1, Register src2, Register tmp1,
                XMMRegister tmp2, XMMRegister tmp3);
  void I8x16ShrS(XMMRegister dst, XMMRegister src1, uint8_t src2,
                 XMMRegister tmp);
  void I8x16ShrS(XMMRegister dst, XMMRegister src1, Register src2,
                 Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void I8x16ShrU(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
                 XMMRegister tmp2);
  void I8x16ShrU(XMMRegister dst, XMMRegister src1, Register src2,
                 Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void I16x8Splat(XMMRegister dst, Register src);
  void I16x8Splat(XMMRegister dst, Operand src);
  void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                      XMMRegister scratch, bool is_signed);
  void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
  void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  // Will move src1 to dst if AVX is not supported.
  void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
                                 XMMRegister tmp);
  // Requires that dst == src1 if AVX is not supported.
  void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   XMMRegister scratch, bool low, bool is_signed);
  void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
  void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  void I64x2Neg(XMMRegister dst, XMMRegister src, XMMRegister scratch);
  void I64x2Abs(XMMRegister dst, XMMRegister src, XMMRegister scratch);
  void I64x2GtS(XMMRegister dst, XMMRegister src0, XMMRegister src1,
                XMMRegister scratch);
  void I64x2GeS(XMMRegister dst, XMMRegister src0, XMMRegister src1,
                XMMRegister scratch);
  void I64x2ShrS(XMMRegister dst, XMMRegister src, uint8_t shift,
                 XMMRegister xmm_tmp);
  void I64x2ShrS(XMMRegister dst, XMMRegister src, Register shift,
                 XMMRegister xmm_tmp, XMMRegister xmm_shift,
                 Register tmp_shift);
  void I64x2Mul(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister tmp1, XMMRegister tmp2);
  void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   XMMRegister scratch, bool low, bool is_signed);
  void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);
  void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  void S128Not(XMMRegister dst, XMMRegister src, XMMRegister scratch);
  // Requires dst == mask when AVX is not supported.
  void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
                  XMMRegister src2, XMMRegister scratch);
  void S128Load8Splat(XMMRegister dst, Operand src, XMMRegister scratch);
  void S128Load16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
  void S128Load32Splat(XMMRegister dst, Operand src);
  void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx);

  void F64x2Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F64x2Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F32x4Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F32x4Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);

 protected:
  template <typename Op>
  using AvxFn = void (Assembler::*)(XMMRegister, XMMRegister, Op, uint8_t);
  template <typename Op>
  using NoAvxFn = void (Assembler::*)(XMMRegister, Op, uint8_t);

  template <typename Op>
  void PinsrHelper(Assembler* assm, AvxFn<Op> avx, NoAvxFn<Op> noavx,
                   XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
                   uint32_t* load_pc_offset = nullptr,
                   base::Optional<CpuFeature> feature = base::nullopt) {
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(assm, AVX);
      if (load_pc_offset) *load_pc_offset = assm->pc_offset();
      (assm->*avx)(dst, src1, src2, imm8);
      return;
    }

    if (dst != src1) assm->movaps(dst, src1);
    if (load_pc_offset) *load_pc_offset = assm->pc_offset();
    if (feature.has_value()) {
      DCHECK(CpuFeatures::IsSupported(*feature));
      CpuFeatureScope scope(assm, *feature);
      (assm->*noavx)(dst, src2, imm8);
    } else {
      (assm->*noavx)(dst, src2, imm8);
    }
  }

 private:
  template <typename Op>
  void I8x16SplatPreAvx2(XMMRegister dst, Op src, XMMRegister scratch);
  template <typename Op>
  void I16x8SplatPreAvx2(XMMRegister dst, Op src);
};

// Common base class template shared by ia32 and x64 TurboAssembler. This uses
// the Curiously Recurring Template Pattern (CRTP), where Impl is the actual
// class (a subclass of SharedTurboAssemblerBase instantiated with the actual
// class). This allows static polymorphism: member functions can be moved
// into SharedTurboAssembler, and we can also call into member functions
// defined in the ia32- or x64-specific TurboAssembler from within this
// template class, via Impl.
//
// Note: all member functions must be defined in this header file so that the
// compiler can generate code for the function definitions. See
// https://isocpp.org/wiki/faq/templates#templates-defn-vs-decl for rationale.
// If a function does not need polymorphism, move it into SharedTurboAssembler,
// and define it outside of this header.
template <typename Impl>
class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
  using SharedTurboAssembler::SharedTurboAssembler;

 public:
  void Abspd(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Andps,
              ExternalReference::address_of_double_abs_constant());
  }

  void Absps(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Andps,
              ExternalReference::address_of_float_abs_constant());
  }

  void Negpd(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Xorps,
              ExternalReference::address_of_double_neg_constant());
  }

  void Negps(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Xorps,
              ExternalReference::address_of_float_neg_constant());
  }
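  // The constants above are plain sign-bit masks (assuming the usual layout
  // of these V8 constants): the abs constants clear only the sign bit via
  // Andps, and the neg constants flip only the sign bit via Xorps.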

  void Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
    if (imm8 == 0) {
      Movd(dst, src);
      return;
    }

    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(this, AVX);
      vpextrd(dst, src, imm8);
    } else if (CpuFeatures::IsSupported(SSE4_1)) {
      CpuFeatureScope sse_scope(this, SSE4_1);
      pextrd(dst, src, imm8);
    } else {
      DCHECK_LT(imm8, 2);
      impl()->PextrdPreSse41(dst, src, imm8);
    }
  }

  template <typename Op>
  void Pinsrd(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    if (CpuFeatures::IsSupported(SSE4_1)) {
      PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1,
                  src2, imm8, load_pc_offset,
                  base::Optional<CpuFeature>(SSE4_1));
    } else {
      if (dst != src1) {
        movaps(dst, src1);
      }
      impl()->PinsrdPreSse41(dst, src2, imm8, load_pc_offset);
    }
  }

  template <typename Op>
  void Pinsrd(XMMRegister dst, Op src, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    Pinsrd(dst, dst, src, imm8, load_pc_offset);
  }

  void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src,
                             Register scratch) {
    ASM_CODE_COMMENT(this);
    // dst = [ src_low, 0x43300000, src_high, 0x43300000 ];
    // 0x43300000'00000000 is a special double whose significand bits can
    // precisely represent all uint32 values.
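    // For example (illustrative): a lane holding the uint32 value 5 becomes
    // the bit pattern 0x43300000'00000005, i.e. the double 2^52 + 5, and the
    // Subpd below removes the 2^52 bias, leaving exactly 5.0.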
    if (!CpuFeatures::IsSupported(AVX) && dst != src) {
      movaps(dst, src);
      src = dst;
    }
    Unpcklps(dst, src,
             ExternalReferenceAsOperand(
                 ExternalReference::
                     address_of_wasm_f64x2_convert_low_i32x4_u_int_mask(),
                 scratch));
    Subpd(dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_double_2_power_52(), scratch));
  }

  void I32x4SConvertF32x4(XMMRegister dst, XMMRegister src, XMMRegister tmp,
                          Register scratch) {
    ASM_CODE_COMMENT(this);
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch);

    // This algorithm works by:
    // 1. lanes with NaNs are zeroed
    // 2. lanes >= 2147483648.0f (INT32_MAX + 1) are set to 0xffff'ffff
    // 3. cvttps2dq sets all out-of-range lanes to 0x8000'0000
    //   a. correct for underflows (< INT32_MIN)
    //   b. wrong for overflows, but we know which lanes overflowed from 2.
    // 4. adjust for 3b by xor-ing 2 and 3
    //   a. 0x8000'0000 xor 0xffff'ffff = 0x7fff'ffff (INT32_MAX)
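    // Illustrative single-lane trace: for src = 3e9 (too large), step 1 keeps
    // 3e9, step 2 sets the mask lane to 0xffff'ffff, step 3 produces
    // 0x8000'0000, and the final xor yields 0x7fff'ffff (INT32_MAX). For a
    // NaN lane, step 1 zeroes it and the remaining steps leave it at 0.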
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(this, AVX);
      vcmpeqps(tmp, src, src);
      vandps(dst, src, tmp);
      vcmpgeps(tmp, src, op);
      vcvttps2dq(dst, dst);
      vpxor(dst, dst, tmp);
    } else {
      if (src == dst) {
        movaps(tmp, src);
        cmpeqps(tmp, tmp);
        andps(dst, tmp);
        movaps(tmp, op);
        cmpleps(tmp, dst);
        cvttps2dq(dst, dst);
        xorps(dst, tmp);
      } else {
        movaps(tmp, op);
        cmpleps(tmp, src);
        cvttps2dq(dst, src);
        xorps(dst, tmp);
        movaps(tmp, src);
        cmpeqps(tmp, tmp);
        andps(dst, tmp);
      }
    }
  }

  void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src,
                               XMMRegister scratch, Register tmp) {
    ASM_CODE_COMMENT(this);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      XMMRegister original_dst = dst;
      // Make sure we don't overwrite src.
      if (dst == src) {
        DCHECK_NE(src, scratch);
        dst = scratch;
      }
      // dst = 0 if src == NaN, else all ones.
      vcmpeqpd(dst, src, src);
      // dst = 0 if src == NaN, else INT32_MAX as double.
      vandpd(
          dst, dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
      // dst = 0 if src == NaN, else src saturated to INT32_MAX as double.
      vminpd(dst, src, dst);
      // Values > INT32_MAX are already saturated; values < INT32_MIN raise an
      // exception, which is masked and returns 0x80000000.
      vcvttpd2dq(original_dst, dst);
    } else {
      if (dst != src) {
        movaps(dst, src);
      }
      movaps(scratch, dst);
      cmpeqpd(scratch, dst);
      andps(scratch,
            ExternalReferenceAsOperand(
                ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
      minpd(dst, scratch);
      cvttpd2dq(dst, dst);
    }
  }

  void I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src,
                               XMMRegister scratch, Register tmp) {
    ASM_CODE_COMMENT(this);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vxorpd(scratch, scratch, scratch);
      // Saturate to 0.
      vmaxpd(dst, src, scratch);
      // Saturate to UINT32_MAX.
      vminpd(
          dst, dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_uint32_max_as_double(), tmp));
      // Truncate.
      vroundpd(dst, dst, kRoundToZero);
      // Add to the special double whose significand bits can hold a uint32.
      vaddpd(dst, dst,
             ExternalReferenceAsOperand(
                 ExternalReference::address_of_wasm_double_2_power_52(), tmp));
      // Extract low 32 bits of each double's significand, zero top lanes.
      // dst = [dst[0], dst[2], 0, 0]
      vshufps(dst, dst, scratch, 0x88);
    } else {
      CpuFeatureScope scope(this, SSE4_1);
      if (dst != src) {
        movaps(dst, src);
      }
      xorps(scratch, scratch);
      maxpd(dst, scratch);
      minpd(dst, ExternalReferenceAsOperand(
                     ExternalReference::address_of_wasm_uint32_max_as_double(),
                     tmp));
      roundpd(dst, dst, kRoundToZero);
      addpd(dst,
            ExternalReferenceAsOperand(
                ExternalReference::address_of_wasm_double_2_power_52(), tmp));
      shufps(dst, scratch, 0x88);
    }
  }

  void I32x4TruncF64x2UZero(XMMRegister dst, XMMRegister src, Register tmp,
                            XMMRegister scratch) {
    // TODO(zhin): call this from I32x4TruncSatF64x2UZero.
    ASM_CODE_COMMENT(this);
    if (dst != src && !CpuFeatures::IsSupported(AVX)) {
      movaps(dst, src);
      src = dst;
    }
    // Same as I32x4TruncSatF64x2UZero but without the saturation.
    Roundpd(dst, src, kRoundToZero);
    // Add to the special double whose significand bits can hold a uint32.
    Addpd(dst, dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_double_2_power_52(), tmp));
    // Extract low 32 bits of each double's significand, zero top lanes.
    // dst = [dst[0], dst[2], 0, 0]
    Shufps(dst, dst, scratch, 0x88);
  }

  void I32x4TruncF32x4U(XMMRegister dst, XMMRegister src, Register scratch,
                        XMMRegister tmp) {
    ASM_CODE_COMMENT(this);
    Operand int32_overflow_op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vcmpltps(tmp, src, int32_overflow_op);
    } else {
      movaps(tmp, src);
      cmpltps(tmp, int32_overflow_op);
    }
    // In tmp, lanes < INT32_MAX are left alone, other lanes are zeroed.
    Pand(tmp, src);
    // tmp = src with all the valid conversions
    if (dst != src) {
      Movaps(dst, src);
    }
    // In dst, lanes < INT32_MAX are zeroed, other lanes left alone.
    Pxor(dst, tmp);
    // tmp contains only lanes which can be converted correctly (<INT32_MAX)
    Cvttps2dq(tmp, tmp);
    // Bit-trick follows:
    // All integers from INT32_MAX to UINT32_MAX that are representable as
    // floats lie in [0x4f00'0000, 0x4f80'0000).
    // The bit representation of such an integer is essentially its float bit
    // pattern shifted left by 8, once the lowest exponent bit is flipped.
    // For example, given 2147483904.0f (which fits in UINT32_MAX):
    //
    // 01001111 00000000 00000000 00000001 (float 0x4f00'0001)
    //          ^^^^^^^^ ^^^^^^^^ ^^^^^^^^
    //          these are exactly the top 24 bits of the int representation,
    //          but the top bit needs to be flipped
    // 10000000 00000000 00000001 00000000 (int 0x8000'0100)
    //
    // So what needs to be done is to flip bit 23, which is the lowest bit of
    // the exponent; that means multiplying by 2 (or adding the value to
    // itself with addps).
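    // Illustrative trace for the 2147483904.0f example above: Addps doubles
    // it to 4294967808.0f (bits 0x4f80'0001), Pslld by 8 turns those bits
    // into 0x8000'0100, which is 2147483904 as a uint32, and the matching
    // lane in tmp is zero (the lane was out of range), so the final Paddd
    // leaves it unchanged.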
    Addps(dst, dst, dst);
    // Then shift to get the bit representation of the int.
    Pslld(dst, byte{8});
    // Merge the converted lanes and bit shifted lanes.
    Paddd(dst, tmp);
  }

  void I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src,
                                 Register scratch) {
    ASM_CODE_COMMENT(this);
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i16x8_splat_0x0001(), scratch);
    // pmaddwd multiplies signed words in src and op, producing
    // signed doublewords, then adds pairwise.
    // src = |a|b|c|d|e|f|g|h|
    // dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
    if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
      movaps(dst, src);
      src = dst;
    }

    Pmaddwd(dst, src, op);
  }

  void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
                                 XMMRegister scratch, Register tmp) {
    ASM_CODE_COMMENT(this);
    // pmaddubsw treats the first operand as unsigned, so pass the external
    // reference to it as the first operand.
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_splat_0x01(), tmp);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vmovdqa(scratch, op);
      vpmaddubsw(dst, scratch, src);
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      if (dst == src) {
        movaps(scratch, op);
        pmaddubsw(scratch, src);
        movaps(dst, scratch);
      } else {
        movaps(dst, op);
        pmaddubsw(dst, src);
      }
    }
  }

  void I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src,
                                 Register scratch) {
    ASM_CODE_COMMENT(this);
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_splat_0x01(), scratch);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpmaddubsw(dst, src, op);
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      if (dst != src) {
        movaps(dst, src);
      }
      pmaddubsw(dst, op);
    }
  }

  void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
                    XMMRegister scratch, Register tmp, bool omit_add = false) {
    ASM_CODE_COMMENT(this);
    if (omit_add) {
      // We have determined that the indices are immediates, and they are
      // either within bounds, or the top bit is set, so we can omit the add.
      Pshufb(dst, src, mask);
      return;
    }

    // Out-of-range indices should return 0; add 112 (0x70) so that any index
    // greater than 15 ends up with its top bit set (saturating at 255), which
    // makes pshufb zero that lane.
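    // For example (illustrative): index 15 becomes 127 and still selects
    // byte 15 (pshufb only uses the low 4 bits when the top bit is clear),
    // index 16 becomes 128 (top bit set, lane zeroed), and index 200
    // saturates to 255 (also zeroed).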
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_swizzle_mask(), tmp);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpaddusb(scratch, mask, op);
      vpshufb(dst, src, scratch);
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      movaps(scratch, op);
      if (dst != src) {
        DCHECK_NE(dst, mask);
        movaps(dst, src);
      }
      paddusb(scratch, mask);
      pshufb(dst, scratch);
    }
  }

  void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
                   XMMRegister tmp2, Register scratch) {
    ASM_CODE_COMMENT(this);
    DCHECK_NE(dst, tmp1);
    DCHECK_NE(src, tmp1);
    DCHECK_NE(dst, tmp2);
    DCHECK_NE(src, tmp2);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vmovdqa(tmp1, ExternalReferenceAsOperand(
                        ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                        scratch));
      vpandn(tmp2, tmp1, src);
      vpand(dst, tmp1, src);
      vmovdqa(tmp1, ExternalReferenceAsOperand(
                        ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
                        scratch));
      vpsrlw(tmp2, tmp2, 4);
      vpshufb(dst, tmp1, dst);
      vpshufb(tmp2, tmp1, tmp2);
      vpaddb(dst, dst, tmp2);
    } else if (CpuFeatures::IsSupported(INTEL_ATOM)) {
      // Pre-Goldmont low-power Intel microarchitectures have a very slow
      // PSHUFB instruction, so use a PSHUFB-free divide-and-conquer algorithm
      // on these processors. The ATOM CPU feature captures exactly the right
      // set of processors.
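      // This is the classic per-byte SWAR popcount (illustrative formula):
      //   x -= (x >> 1) & 0x55;
      //   x = (x & 0x33) + ((x >> 2) & 0x33);
      //   x = (x + (x >> 4)) & 0x0f;
      // psrlw shifts whole 16-bit words, but the masks clear any bits that
      // spill over from the neighboring byte.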
      movaps(tmp1, src);
      psrlw(tmp1, 1);
      if (dst != src) {
        movaps(dst, src);
      }
      andps(tmp1, ExternalReferenceAsOperand(
                      ExternalReference::address_of_wasm_i8x16_splat_0x55(),
                      scratch));
      psubb(dst, tmp1);
      Operand splat_0x33 = ExternalReferenceAsOperand(
          ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
      movaps(tmp1, dst);
      andps(dst, splat_0x33);
      psrlw(tmp1, 2);
      andps(tmp1, splat_0x33);
      paddb(dst, tmp1);
      movaps(tmp1, dst);
      psrlw(dst, 4);
      paddb(dst, tmp1);
      andps(dst, ExternalReferenceAsOperand(
                     ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                     scratch));
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      movaps(tmp1, ExternalReferenceAsOperand(
                       ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                       scratch));
      Operand mask = ExternalReferenceAsOperand(
          ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
      if (tmp2 != tmp1) {
        movaps(tmp2, tmp1);
      }
      andps(tmp1, src);
      andnps(tmp2, src);
      psrlw(tmp2, 4);
      movaps(dst, mask);
      pshufb(dst, tmp1);
      movaps(tmp1, mask);
      pshufb(tmp1, tmp2);
      paddb(dst, tmp1);
    }
  }

 private:
  // All implementation-specific methods must be called through this.
  Impl* impl() { return static_cast<Impl*>(this); }

  Operand ExternalReferenceAsOperand(ExternalReference reference,
                                     Register scratch) {
    return impl()->ExternalReferenceAsOperand(reference, scratch);
  }

  using FloatInstruction = void (SharedTurboAssembler::*)(XMMRegister,
                                                          XMMRegister, Operand);
  void FloatUnop(XMMRegister dst, XMMRegister src, Register tmp,
                 FloatInstruction op, ExternalReference ext) {
    if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
      movaps(dst, src);
      src = dst;
    }
    SharedTurboAssembler* assm = this;
    (assm->*op)(dst, src, ExternalReferenceAsOperand(ext, tmp));
  }
};

}  // namespace internal
}  // namespace v8
#endif  // V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_