// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_
#define V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_

#include "src/base/macros.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/external-reference.h"
#include "src/codegen/turbo-assembler.h"

#if V8_TARGET_ARCH_IA32
#include "src/codegen/ia32/register-ia32.h"
#elif V8_TARGET_ARCH_X64
#include "src/codegen/x64/register-x64.h"
#else
#error Unsupported target architecture.
#endif

namespace v8 {
namespace internal {
class Assembler;

// For WebAssembly we care about the full floating-point register. If we are
// not running Wasm, we can get away with saving half of those registers.
#if V8_ENABLE_WEBASSEMBLY
constexpr int kStackSavedSavedFPSize = 2 * kDoubleSize;
#else
constexpr int kStackSavedSavedFPSize = kDoubleSize;
#endif  // V8_ENABLE_WEBASSEMBLY

// Base class for SharedTurboAssemblerBase. This class contains macro-assembler
// functions that can be shared across ia32 and x64 without any template
// machinery, i.e. it does not require the CRTP pattern that
// SharedTurboAssemblerBase exposes. This allows us to keep the bulk of the
// definitions in a separate source file, rather than putting everything inside
// this header.
class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
 public:
  using TurboAssemblerBase::TurboAssemblerBase;

  void Move(Register dst, uint32_t src);
  // Move if registers are not identical.
  void Move(Register dst, Register src);
  void Add(Register dst, Immediate src);
  void And(Register dst, Immediate src);

  // Will move src1 to dst if AVX is not supported.
  void Movhps(XMMRegister dst, XMMRegister src1, Operand src2);
  void Movlps(XMMRegister dst, XMMRegister src1, Operand src2);

  void Pblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                XMMRegister mask);

  template <typename Op>
  void Pinsrb(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    PinsrHelper(this, &Assembler::vpinsrb, &Assembler::pinsrb, dst, src1, src2,
                imm8, load_pc_offset, {SSE4_1});
  }

  template <typename Op>
  void Pinsrw(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    PinsrHelper(this, &Assembler::vpinsrw, &Assembler::pinsrw, dst, src1, src2,
                imm8, load_pc_offset);
  }

  // Supports both SSE and AVX. Moves src to dst if they are not equal on SSE.
  template <typename Op>
  void Pshufb(XMMRegister dst, XMMRegister src, Op mask) {
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpshufb(dst, src, mask);
    } else {
      // Make sure these are different so that we won't overwrite mask.
      DCHECK_NE(mask, dst);
      if (dst != src) {
        movaps(dst, src);
      }
      CpuFeatureScope sse_scope(this, SSSE3);
      pshufb(dst, mask);
    }
  }

  template <typename Op>
  void Pshufb(XMMRegister dst, Op mask) {
    Pshufb(dst, dst, mask);
  }

  // Shufps that will move src1 into dst if AVX is not supported.
  void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
              uint8_t imm8);
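  // For illustration only: like the SSE fallbacks above, the Pinsr* and
  // Pshufb wrappers copy src into dst before using the destructive SSE
  // encoding. A call such as
  //   Pshufb(xmm0, xmm1, xmm2);
  // is expected to emit roughly
  //   vpshufb xmm0, xmm1, xmm2   ; with AVX
  // or
  //   movaps xmm0, xmm1          ; without AVX (only when xmm0 != xmm1)
  //   pshufb xmm0, xmm2          ; requires SSSE3
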
  // Helper struct to implement functions that check for AVX support and
  // dispatch to the appropriate AVX/SSE instruction.
  template <typename Dst, typename Arg, typename... Args>
  struct AvxHelper {
    Assembler* assm;
    base::Optional<CpuFeature> feature = base::nullopt;
    // Call a method where the AVX version expects the dst argument to be
    // duplicated.
    // E.g. Andps(x, y) -> vandps(x, x, y)
    //                  -> andps(x, y)
    template <void (Assembler::*avx)(Dst, Dst, Arg, Args...),
              void (Assembler::*no_avx)(Dst, Arg, Args...)>
    void emit(Dst dst, Arg arg, Args... args) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope scope(assm, AVX);
        (assm->*avx)(dst, dst, arg, args...);
      } else if (feature.has_value()) {
        DCHECK(CpuFeatures::IsSupported(*feature));
        CpuFeatureScope scope(assm, *feature);
        (assm->*no_avx)(dst, arg, args...);
      } else {
        (assm->*no_avx)(dst, arg, args...);
      }
    }

    // Call a method in the AVX form (one more operand); if AVX is not
    // supported, check that dst == first src.
    // E.g. Andps(x, y, z) -> vandps(x, y, z)
    //                     -> andps(x, z) and check that x == y
    template <void (Assembler::*avx)(Dst, Arg, Args...),
              void (Assembler::*no_avx)(Dst, Args...)>
    void emit(Dst dst, Arg arg, Args... args) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope scope(assm, AVX);
        (assm->*avx)(dst, arg, args...);
      } else if (feature.has_value()) {
        DCHECK_EQ(dst, arg);
        DCHECK(CpuFeatures::IsSupported(*feature));
        CpuFeatureScope scope(assm, *feature);
        (assm->*no_avx)(dst, args...);
      } else {
        DCHECK_EQ(dst, arg);
        (assm->*no_avx)(dst, args...);
      }
    }

    // Call a method where the AVX version expects no duplicated dst argument.
    // E.g. Movddup(x, y) -> vmovddup(x, y)
    //                    -> movddup(x, y)
    template <void (Assembler::*avx)(Dst, Arg, Args...),
              void (Assembler::*no_avx)(Dst, Arg, Args...)>
    void emit(Dst dst, Arg arg, Args... args) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope scope(assm, AVX);
        (assm->*avx)(dst, arg, args...);
      } else if (feature.has_value()) {
        DCHECK(CpuFeatures::IsSupported(*feature));
        CpuFeatureScope scope(assm, *feature);
        (assm->*no_avx)(dst, arg, args...);
      } else {
        (assm->*no_avx)(dst, arg, args...);
      }
    }
  };

#define AVX_OP(macro_name, name)                                        \
  template <typename Dst, typename Arg, typename... Args>               \
  void macro_name(Dst dst, Arg arg, Args... args) {                     \
    AvxHelper<Dst, Arg, Args...>{this}                                  \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \
                                                              args...); \
  }
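
// For illustration only, a sketch of what AVX_OP expands to: AVX_OP(Andps,
// andps) defines
//   template <typename Dst, typename Arg, typename... Args>
//   void Andps(Dst dst, Arg arg, Args... args) {
//     AvxHelper<Dst, Arg, Args...>{this}
//         .template emit<&Assembler::vandps, &Assembler::andps>(dst, arg,
//                                                               args...);
//   }
// so Andps(x, y) emits vandps(x, x, y) when AVX is supported and andps(x, y)
// otherwise.
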
// Define a macro which uses |avx_name| when AVX is supported, and |sse_name|
// when AVX is not supported. This is useful for bit-wise instructions like
// andpd/andps, where the behavior is exactly the same, but the *ps version is
// 1 byte shorter, and on SSE-only processors there is no performance
// difference since those processors don't differentiate integer and
// floating-point domains.
// Note: we require |avx_name| to be the AVX instruction without the "v"
// prefix. If we required the full AVX instruction name and the caller
// accidentally passed in an SSE instruction, the code would compile without
// any issues and silently generate the SSE instruction. By appending "v"
// here, we ensure that we generate an AVX instruction.
#define AVX_OP_WITH_DIFF_SSE_INSTR(macro_name, avx_name, sse_name)     \
  template <typename Dst, typename Arg, typename... Args>              \
  void macro_name(Dst dst, Arg arg, Args... args) {                    \
    AvxHelper<Dst, Arg, Args...>{this}                                 \
        .template emit<&Assembler::v##avx_name, &Assembler::sse_name>( \
            dst, arg, args...);                                        \
  }

#define AVX_OP_SSE3(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                \
  void macro_name(Dst dst, Arg arg, Args... args) {                      \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE3)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,  \
                                                              args...);  \
  }

#define AVX_OP_SSSE3(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                 \
  void macro_name(Dst dst, Arg arg, Args... args) {                       \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSSE3)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,   \
                                                              args...);   \
  }

#define AVX_OP_SSE4_1(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                  \
  void macro_name(Dst dst, Arg arg, Args... args) {                        \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_1)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,    \
                                                              args...);    \
  }

#define AVX_OP_SSE4_2(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                  \
  void macro_name(Dst dst, Arg arg, Args... args) {                        \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_2)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,    \
                                                              args...);    \
  }
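
// For illustration only, sketches of the variants above:
//   AVX_OP_WITH_DIFF_SSE_INSTR(Pxor, pxor, xorps) makes Pxor(x, y) emit
//   vpxor(x, x, y) with AVX and xorps(x, y) without;
//   AVX_OP_SSE4_1(Pmaxsd, pmaxsd) makes Pmaxsd(x, y) emit vpmaxsd(x, x, y)
//   with AVX, and otherwise emit pmaxsd(x, y) inside a CpuFeatureScope after
//   DCHECKing that SSE4_1 is supported.
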
  // Keep this list sorted by required extension, then instruction name.
  AVX_OP(Addpd, addpd)
  AVX_OP(Addps, addps)
  AVX_OP(Addsd, addsd)
  AVX_OP(Addss, addss)
  AVX_OP(Andnpd, andnpd)
  AVX_OP(Andnps, andnps)
  AVX_OP(Andpd, andpd)
  AVX_OP(Andps, andps)
  AVX_OP(Cmpeqpd, cmpeqpd)
  AVX_OP(Cmpeqps, cmpeqps)
  AVX_OP(Cmplepd, cmplepd)
  AVX_OP(Cmpleps, cmpleps)
  AVX_OP(Cmpltpd, cmpltpd)
  AVX_OP(Cmpltps, cmpltps)
  AVX_OP(Cmpneqpd, cmpneqpd)
  AVX_OP(Cmpneqps, cmpneqps)
  AVX_OP(Cmpunordpd, cmpunordpd)
  AVX_OP(Cmpunordps, cmpunordps)
  AVX_OP(Cvtdq2pd, cvtdq2pd)
  AVX_OP(Cvtdq2ps, cvtdq2ps)
  AVX_OP(Cvtpd2ps, cvtpd2ps)
  AVX_OP(Cvtps2pd, cvtps2pd)
  AVX_OP(Cvtsd2ss, cvtsd2ss)
  AVX_OP(Cvtss2sd, cvtss2sd)
  AVX_OP(Cvttpd2dq, cvttpd2dq)
  AVX_OP(Cvttps2dq, cvttps2dq)
  AVX_OP(Cvttsd2si, cvttsd2si)
  AVX_OP(Cvttss2si, cvttss2si)
  AVX_OP(Divpd, divpd)
  AVX_OP(Divps, divps)
  AVX_OP(Divsd, divsd)
  AVX_OP(Divss, divss)
  AVX_OP(Maxpd, maxpd)
  AVX_OP(Maxps, maxps)
  AVX_OP(Minpd, minpd)
  AVX_OP(Minps, minps)
  AVX_OP(Movaps, movaps)
  AVX_OP(Movd, movd)
  AVX_OP(Movhlps, movhlps)
  AVX_OP(Movhps, movhps)
  AVX_OP(Movlps, movlps)
  AVX_OP(Movmskpd, movmskpd)
  AVX_OP(Movmskps, movmskps)
  AVX_OP(Movsd, movsd)
  AVX_OP(Movss, movss)
  AVX_OP(Movupd, movupd)
  AVX_OP(Movups, movups)
  AVX_OP(Mulpd, mulpd)
  AVX_OP(Mulps, mulps)
  AVX_OP(Mulsd, mulsd)
  AVX_OP(Mulss, mulss)
  AVX_OP(Orpd, orpd)
  AVX_OP(Orps, orps)
  AVX_OP(Packssdw, packssdw)
  AVX_OP(Packsswb, packsswb)
  AVX_OP(Packuswb, packuswb)
  AVX_OP(Paddb, paddb)
  AVX_OP(Paddd, paddd)
  AVX_OP(Paddq, paddq)
  AVX_OP(Paddsb, paddsb)
  AVX_OP(Paddsw, paddsw)
  AVX_OP(Paddusb, paddusb)
  AVX_OP(Paddusw, paddusw)
  AVX_OP(Paddw, paddw)
  AVX_OP(Pavgb, pavgb)
  AVX_OP(Pavgw, pavgw)
  AVX_OP(Pcmpeqb, pcmpeqb)
  AVX_OP(Pcmpeqd, pcmpeqd)
  AVX_OP(Pcmpeqw, pcmpeqw)
  AVX_OP(Pcmpgtb, pcmpgtb)
  AVX_OP(Pcmpgtd, pcmpgtd)
  AVX_OP(Pcmpgtw, pcmpgtw)
  AVX_OP(Pmaddwd, pmaddwd)
  AVX_OP(Pmaxsw, pmaxsw)
  AVX_OP(Pmaxub, pmaxub)
  AVX_OP(Pminsw, pminsw)
  AVX_OP(Pminub, pminub)
  AVX_OP(Pmovmskb, pmovmskb)
  AVX_OP(Pmullw, pmullw)
  AVX_OP(Pmuludq, pmuludq)
  AVX_OP(Pshufd, pshufd)
  AVX_OP(Pshufhw, pshufhw)
  AVX_OP(Pshuflw, pshuflw)
  AVX_OP(Pslld, pslld)
  AVX_OP(Psllq, psllq)
  AVX_OP(Psllw, psllw)
  AVX_OP(Psrad, psrad)
  AVX_OP(Psraw, psraw)
  AVX_OP(Psrld, psrld)
  AVX_OP(Psrlq, psrlq)
  AVX_OP(Psrlw, psrlw)
  AVX_OP(Psubb, psubb)
  AVX_OP(Psubd, psubd)
  AVX_OP(Psubq, psubq)
  AVX_OP(Psubsb, psubsb)
  AVX_OP(Psubsw, psubsw)
  AVX_OP(Psubusb, psubusb)
  AVX_OP(Psubusw, psubusw)
  AVX_OP(Psubw, psubw)
  AVX_OP(Punpckhbw, punpckhbw)
  AVX_OP(Punpckhdq, punpckhdq)
  AVX_OP(Punpckhqdq, punpckhqdq)
  AVX_OP(Punpckhwd, punpckhwd)
  AVX_OP(Punpcklbw, punpcklbw)
  AVX_OP(Punpckldq, punpckldq)
  AVX_OP(Punpcklqdq, punpcklqdq)
  AVX_OP(Punpcklwd, punpcklwd)
  AVX_OP(Rcpps, rcpps)
  AVX_OP(Rsqrtps, rsqrtps)
  AVX_OP(Sqrtpd, sqrtpd)
  AVX_OP(Sqrtps, sqrtps)
  AVX_OP(Sqrtsd, sqrtsd)
  AVX_OP(Sqrtss, sqrtss)
  AVX_OP(Subpd, subpd)
  AVX_OP(Subps, subps)
  AVX_OP(Subsd, subsd)
  AVX_OP(Subss, subss)
  AVX_OP(Ucomisd, ucomisd)
  AVX_OP(Ucomiss, ucomiss)
  AVX_OP(Unpcklps, unpcklps)
  AVX_OP(Xorpd, xorpd)
  AVX_OP(Xorps, xorps)

  // Many AVX processors have separate integer/floating-point domains, so keep
  // the domain-specific instruction (e.g. vmovdqa, vpand) when AVX is
  // supported. On SSE, movaps is 1 byte shorter than movdqa and has the same
  // behavior; most SSE processors also don't have the same delay when moving
  // between the integer and floating-point domains.
  AVX_OP_WITH_DIFF_SSE_INSTR(Movapd, movapd, movaps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Movdqa, movdqa, movaps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Movdqu, movdqu, movups)
  AVX_OP_WITH_DIFF_SSE_INSTR(Pand, pand, andps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Por, por, orps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Pxor, pxor, xorps)

  AVX_OP_SSE3(Haddps, haddps)
  AVX_OP_SSE3(Movddup, movddup)
  AVX_OP_SSE3(Movshdup, movshdup)

  AVX_OP_SSSE3(Pabsb, pabsb)
  AVX_OP_SSSE3(Pabsd, pabsd)
  AVX_OP_SSSE3(Pabsw, pabsw)
  AVX_OP_SSSE3(Palignr, palignr)
  AVX_OP_SSSE3(Pmulhrsw, pmulhrsw)
  AVX_OP_SSSE3(Psignb, psignb)
  AVX_OP_SSSE3(Psignd, psignd)
  AVX_OP_SSSE3(Psignw, psignw)

  AVX_OP_SSE4_1(Extractps, extractps)
  AVX_OP_SSE4_1(Insertps, insertps)
  AVX_OP_SSE4_1(Packusdw, packusdw)
  AVX_OP_SSE4_1(Pblendw, pblendw)
  AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq)
  AVX_OP_SSE4_1(Pextrb, pextrb)
  AVX_OP_SSE4_1(Pextrw, pextrw)
  AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
  AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
  AVX_OP_SSE4_1(Pmaxud, pmaxud)
  AVX_OP_SSE4_1(Pmaxuw, pmaxuw)
  AVX_OP_SSE4_1(Pminsb, pminsb)
  AVX_OP_SSE4_1(Pminsd, pminsd)
  AVX_OP_SSE4_1(Pminud, pminud)
  AVX_OP_SSE4_1(Pminuw, pminuw)
  AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw)
  AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq)
  AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd)
  AVX_OP_SSE4_1(Pmovzxbw, pmovzxbw)
  AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq)
  AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd)
  AVX_OP_SSE4_1(Pmulld, pmulld)
  AVX_OP_SSE4_1(Ptest, ptest)
  AVX_OP_SSE4_1(Roundpd, roundpd)
  AVX_OP_SSE4_1(Roundps, roundps)
  AVX_OP_SSE4_1(Roundsd, roundsd)
  AVX_OP_SSE4_1(Roundss, roundss)

#undef AVX_OP
#undef AVX_OP_SSE3
#undef AVX_OP_SSSE3
#undef AVX_OP_SSE4_1
#undef AVX_OP_SSE4_2

  void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane);
  void F64x2ReplaceLane(XMMRegister dst, XMMRegister src, DoubleRegister rep,
                        uint8_t lane);
  void F64x2Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void F64x2Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void F32x4Splat(XMMRegister dst, DoubleRegister src);
  void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
  void F32x4Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void F32x4Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
  void I8x16Splat(XMMRegister dst, Register src, XMMRegister scratch);
  void I8x16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
  void I8x16Shl(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
                XMMRegister tmp2);
  void I8x16Shl(XMMRegister dst, XMMRegister src1, Register src2,
                Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void I8x16ShrS(XMMRegister dst, XMMRegister src1, uint8_t src2,
                 XMMRegister tmp);
  void I8x16ShrS(XMMRegister dst, XMMRegister src1, Register src2,
                 Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void I8x16ShrU(XMMRegister dst, XMMRegister src1, uint8_t src2,
                 Register tmp1, XMMRegister tmp2);
  void I8x16ShrU(XMMRegister dst, XMMRegister src1, Register src2,
                 Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void I16x8Splat(XMMRegister dst, Register src);
  void I16x8Splat(XMMRegister dst, Operand src);
  void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                      XMMRegister scratch, bool is_signed);
  void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
  void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  // Will move src1 to dst if AVX is not supported.
  void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
                                 XMMRegister tmp);
  // Requires that dst == src1 if AVX is not supported.
  void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   XMMRegister scratch, bool low, bool is_signed);
  void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
  void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  void I64x2Neg(XMMRegister dst, XMMRegister src, XMMRegister scratch);
  void I64x2Abs(XMMRegister dst, XMMRegister src, XMMRegister scratch);
  void I64x2GtS(XMMRegister dst, XMMRegister src0, XMMRegister src1,
                XMMRegister scratch);
  void I64x2GeS(XMMRegister dst, XMMRegister src0, XMMRegister src1,
                XMMRegister scratch);
  void I64x2ShrS(XMMRegister dst, XMMRegister src, uint8_t shift,
                 XMMRegister xmm_tmp);
  void I64x2ShrS(XMMRegister dst, XMMRegister src, Register shift,
                 XMMRegister xmm_tmp, XMMRegister xmm_shift,
                 Register tmp_shift);
  void I64x2Mul(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister tmp1, XMMRegister tmp2);
  void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   XMMRegister scratch, bool low, bool is_signed);
  void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);
  void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  void S128Not(XMMRegister dst, XMMRegister src, XMMRegister scratch);
  // Requires dst == mask when AVX is not supported.
  void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
                  XMMRegister src2, XMMRegister scratch);
  void S128Load8Splat(XMMRegister dst, Operand src, XMMRegister scratch);
  void S128Load16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
  void S128Load32Splat(XMMRegister dst, Operand src);
  void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx);

  void F64x2Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F64x2Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F32x4Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F32x4Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);

 protected:
  template <typename Op>
  using AvxFn = void (Assembler::*)(XMMRegister, XMMRegister, Op, uint8_t);
  template <typename Op>
  using NoAvxFn = void (Assembler::*)(XMMRegister, Op, uint8_t);

  template <typename Op>
  void PinsrHelper(Assembler* assm, AvxFn<Op> avx, NoAvxFn<Op> noavx,
                   XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
                   uint32_t* load_pc_offset = nullptr,
                   base::Optional<CpuFeature> feature = base::nullopt) {
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(assm, AVX);
      if (load_pc_offset) *load_pc_offset = assm->pc_offset();
      (assm->*avx)(dst, src1, src2, imm8);
      return;
    }

    if (dst != src1) assm->movaps(dst, src1);
    if (load_pc_offset) *load_pc_offset = assm->pc_offset();
    if (feature.has_value()) {
      DCHECK(CpuFeatures::IsSupported(*feature));
      CpuFeatureScope scope(assm, *feature);
      (assm->*noavx)(dst, src2, imm8);
    } else {
      (assm->*noavx)(dst, src2, imm8);
    }
  }

 private:
  template <typename Op>
  void I8x16SplatPreAvx2(XMMRegister dst, Op src, XMMRegister scratch);
  template <typename Op>
  void I16x8SplatPreAvx2(XMMRegister dst, Op src);
};

// Common base class template shared by ia32 and x64 TurboAssembler. This uses
// the Curiously Recurring Template Pattern (CRTP), where Impl is the actual
// class (the subclass of SharedTurboAssemblerBase that it is instantiated
// with). This allows static polymorphism: member functions can be moved into
// this shared base class, and we can still call into member functions defined
// in the ia32- or x64-specific TurboAssembler from within this template
// class, via Impl.
//
// Note: all member functions must be defined in this header file so that the
// compiler can generate code for the function definitions. See
// https://isocpp.org/wiki/faq/templates#templates-defn-vs-decl for the
// rationale. If a function does not need polymorphism, move it into
// SharedTurboAssembler and define it outside of this header.
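// For illustration only, a simplified sketch of how the CRTP is wired up in
// the architecture-specific headers (see the ia32/x64 TurboAssembler headers
// for the real declarations):
//
//   class TurboAssembler : public SharedTurboAssemblerBase<TurboAssembler> {
//     ...
//     // Architecture-specific fallbacks reached via impl(), e.g. from
//     // Pextrd/Pinsrd below when SSE4_1 is unavailable.
//     void PextrdPreSse41(Register dst, XMMRegister src, uint8_t imm8);
//     void PinsrdPreSse41(XMMRegister dst, Register src2, uint8_t imm8,
//                         uint32_t* load_pc_offset);
//   };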
template <typename Impl>
class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
  using SharedTurboAssembler::SharedTurboAssembler;

 public:
  void Abspd(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Andps,
              ExternalReference::address_of_double_abs_constant());
  }

  void Absps(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Andps,
              ExternalReference::address_of_float_abs_constant());
  }

  void Negpd(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Xorps,
              ExternalReference::address_of_double_neg_constant());
  }

  void Negps(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Xorps,
              ExternalReference::address_of_float_neg_constant());
  }

  void Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
    if (imm8 == 0) {
      Movd(dst, src);
      return;
    }

    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(this, AVX);
      vpextrd(dst, src, imm8);
    } else if (CpuFeatures::IsSupported(SSE4_1)) {
      CpuFeatureScope sse_scope(this, SSE4_1);
      pextrd(dst, src, imm8);
    } else {
      DCHECK_LT(imm8, 2);
      impl()->PextrdPreSse41(dst, src, imm8);
    }
  }

  template <typename Op>
  void Pinsrd(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    if (CpuFeatures::IsSupported(SSE4_1)) {
      PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1,
                  src2, imm8, load_pc_offset,
                  base::Optional<CpuFeature>(SSE4_1));
    } else {
      if (dst != src1) {
        movaps(dst, src1);
      }
      impl()->PinsrdPreSse41(dst, src2, imm8, load_pc_offset);
    }
  }

  template <typename Op>
  void Pinsrd(XMMRegister dst, Op src, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    Pinsrd(dst, dst, src, imm8, load_pc_offset);
  }
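
  // A worked example of the 2^52 bias trick used by several of the
  // conversions below (for illustration only): 0x43300000'00000000 is the
  // double 2^52. For any 0 <= x < 2^32, the bit pattern
  // [ x, 0x43300000 ] (low word, high word) is exactly the double 2^52 + x,
  // because x occupies the low 32 bits of the 52-bit significand. Subtracting
  // 2^52 therefore yields x as a double with no rounding; conversely, adding
  // 2^52 to an already-truncated double in [0, 2^32) leaves the uint32 result
  // in the low 32 bits of the significand.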
  void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src,
                             Register scratch) {
    ASM_CODE_COMMENT(this);
    // dst = [ src_low, 0x43300000, src_high, 0x43300000 ];
    // 0x43300000'00000000 is a special double whose significand bits can
    // precisely represent all uint32 numbers.
    if (!CpuFeatures::IsSupported(AVX) && dst != src) {
      movaps(dst, src);
      src = dst;
    }
    Unpcklps(dst, src,
             ExternalReferenceAsOperand(
                 ExternalReference::
                     address_of_wasm_f64x2_convert_low_i32x4_u_int_mask(),
                 scratch));
    Subpd(dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_double_2_power_52(), scratch));
  }

  void I32x4SConvertF32x4(XMMRegister dst, XMMRegister src, XMMRegister tmp,
                          Register scratch) {
    ASM_CODE_COMMENT(this);
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch);

    // This algorithm works by:
    // 1. lanes with NaNs are zeroed
    // 2. lanes >= 2147483648.0f (MAX_INT32 + 1) are set to 0xffff'ffff
    // 3. cvttps2dq sets all out-of-range lanes to 0x8000'0000
    //    a. correct for underflows (< MIN_INT32)
    //    b. wrong for overflows, and we know which lanes overflow from 2.
    // 4. adjust for 3b by xor-ing 2 and 3
    //    a. 0x8000'0000 xor 0xffff'ffff = 0x7fff'ffff (MAX_INT32)
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(this, AVX);
      vcmpeqps(tmp, src, src);
      vandps(dst, src, tmp);
      vcmpgeps(tmp, src, op);
      vcvttps2dq(dst, dst);
      vpxor(dst, dst, tmp);
    } else {
      if (src == dst) {
        movaps(tmp, src);
        cmpeqps(tmp, tmp);
        andps(dst, tmp);
        movaps(tmp, op);
        cmpleps(tmp, dst);
        cvttps2dq(dst, dst);
        xorps(dst, tmp);
      } else {
        movaps(tmp, op);
        cmpleps(tmp, src);
        cvttps2dq(dst, src);
        xorps(dst, tmp);
        movaps(tmp, src);
        cmpeqps(tmp, tmp);
        andps(dst, tmp);
      }
    }
  }
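
  // For illustration only, a per-lane trace of I32x4SConvertF32x4 above:
  //   lane = 3e9f : kept by the self-equality mask; ge-mask = 0xffff'ffff;
  //                 cvttps2dq gives 0x8000'0000; the final xor turns that
  //                 into 0x7fff'ffff (saturated to INT32_MAX).
  //   lane = -3e9f: ge-mask = 0; cvttps2dq gives 0x8000'0000, which is
  //                 already the saturated INT32_MIN result.
  //   lane = NaN  : zeroed by the self-equality mask; ge-mask = 0; result 0.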
  void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src,
                               XMMRegister scratch, Register tmp) {
    ASM_CODE_COMMENT(this);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      XMMRegister original_dst = dst;
      // Make sure we don't overwrite src.
      if (dst == src) {
        DCHECK_NE(src, scratch);
        dst = scratch;
      }
      // dst = 0 if src == NaN, else all ones.
      vcmpeqpd(dst, src, src);
      // dst = 0 if src == NaN, else INT32_MAX as double.
      vandpd(
          dst, dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
      // dst = 0 if src == NaN, src is saturated to INT32_MAX as double.
      vminpd(dst, src, dst);
      // Values > INT32_MAX are already saturated; values < INT32_MIN raise an
      // exception, which is masked and returns 0x80000000.
      vcvttpd2dq(original_dst, dst);
    } else {
      if (dst != src) {
        movaps(dst, src);
      }
      movaps(scratch, dst);
      cmpeqpd(scratch, dst);
      andps(scratch,
            ExternalReferenceAsOperand(
                ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
      minpd(dst, scratch);
      cvttpd2dq(dst, dst);
    }
  }

  void I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src,
                               XMMRegister scratch, Register tmp) {
    ASM_CODE_COMMENT(this);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vxorpd(scratch, scratch, scratch);
      // Saturate to 0.
      vmaxpd(dst, src, scratch);
      // Saturate to UINT32_MAX.
      vminpd(
          dst, dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_uint32_max_as_double(), tmp));
      // Truncate.
      vroundpd(dst, dst, kRoundToZero);
      // Add to a special double whose significand bits == uint32.
      vaddpd(dst, dst,
             ExternalReferenceAsOperand(
                 ExternalReference::address_of_wasm_double_2_power_52(), tmp));
      // Extract the low 32 bits of each double's significand, zero top lanes.
      // dst = [dst[0], dst[2], 0, 0]
      vshufps(dst, dst, scratch, 0x88);
    } else {
      CpuFeatureScope scope(this, SSE4_1);
      if (dst != src) {
        movaps(dst, src);
      }
      xorps(scratch, scratch);
      maxpd(dst, scratch);
      minpd(dst, ExternalReferenceAsOperand(
                     ExternalReference::address_of_wasm_uint32_max_as_double(),
                     tmp));
      roundpd(dst, dst, kRoundToZero);
      addpd(dst,
            ExternalReferenceAsOperand(
                ExternalReference::address_of_wasm_double_2_power_52(), tmp));
      shufps(dst, scratch, 0x88);
    }
  }

  void I32x4TruncF64x2UZero(XMMRegister dst, XMMRegister src, Register tmp,
                            XMMRegister scratch) {
    // TODO(zhin): call this from I32x4TruncSatF64x2UZero.
    ASM_CODE_COMMENT(this);
    if (dst != src && !CpuFeatures::IsSupported(AVX)) {
      movaps(dst, src);
      src = dst;
    }
    // Same as I32x4TruncSatF64x2UZero but without the saturation.
    Roundpd(dst, src, kRoundToZero);
    // Add to a special double whose significand bits == uint32.
    Addpd(dst, dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_double_2_power_52(), tmp));
    // Extract the low 32 bits of each double's significand, zero top lanes.
    // dst = [dst[0], dst[2], 0, 0]
    Shufps(dst, dst, scratch, 0x88);
  }

  void I32x4TruncF32x4U(XMMRegister dst, XMMRegister src, Register scratch,
                        XMMRegister tmp) {
    ASM_CODE_COMMENT(this);
    Operand int32_overflow_op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vcmpltps(tmp, src, int32_overflow_op);
    } else {
      movaps(tmp, src);
      cmpltps(tmp, int32_overflow_op);
    }
    // In tmp, lanes < INT32_MAX are left alone, other lanes are zeroed.
    Pand(tmp, src);
    // tmp = src with all the valid conversions.
    if (dst != src) {
      Movaps(dst, src);
    }
    // In dst, lanes < INT32_MAX are zeroed, other lanes are left alone.
    Pxor(dst, tmp);
    // tmp contains only lanes which can be converted correctly (< INT32_MAX).
    Cvttps2dq(tmp, tmp);
    // Bit-trick follows:
    // All integers from INT32_MAX to UINT32_MAX that are representable as
    // floats lie in [0x4f00'0000, 0x4f80'0000).
    // The bit representation of these integers is actually shifted right by 8.
    // For example, given 2147483904.0f (which fits in a uint32):
    //
    // 01001111 00000000 00000000 00000001 (float 0x4f00'0001)
    //          ^^^^^^^^^^^^^^^^^^^^^^^^^^
    //          these are exactly the top 24 bits of the int representation,
    //          but the top bit needs to be flipped
    // 10000000 00000000 00000001 00000000 (int 0x8000'0100)
    //
    // So what needs to be done is to flip bit 23, which is the lowest bit of
    // the exponent, which means multiply by 2 (or addps to itself).
    Addps(dst, dst, dst);
    // Then shift to get the bit representation of the int.
    Pslld(dst, byte{8});
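    // For illustration only, continuing the 2147483904.0f example above:
    // addps doubles the value (bits 0x4f00'0001 -> 0x4f80'0001), and the
    // shift by 8 then yields 0x8000'0100, the desired integer bits for that
    // lane.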
    // Merge the converted lanes and the bit-shifted lanes.
    Paddd(dst, tmp);
  }

  void I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src,
                                 Register scratch) {
    ASM_CODE_COMMENT(this);
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i16x8_splat_0x0001(), scratch);
    // pmaddwd multiplies signed words in src and op, producing
    // signed doublewords, then adds pairwise.
    // src = |a|b|c|d|e|f|g|h|
    // dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
    if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
      movaps(dst, src);
      src = dst;
    }

    Pmaddwd(dst, src, op);
  }

  void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
                                 XMMRegister scratch, Register tmp) {
    ASM_CODE_COMMENT(this);
    // pmaddubsw treats the first operand as unsigned, so pass the external
    // reference to it as the first operand.
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_splat_0x01(), tmp);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vmovdqa(scratch, op);
      vpmaddubsw(dst, scratch, src);
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      if (dst == src) {
        movaps(scratch, op);
        pmaddubsw(scratch, src);
        movaps(dst, scratch);
      } else {
        movaps(dst, op);
        pmaddubsw(dst, src);
      }
    }
  }

  void I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src,
                                 Register scratch) {
    ASM_CODE_COMMENT(this);
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_splat_0x01(), scratch);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpmaddubsw(dst, src, op);
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      if (dst != src) {
        movaps(dst, src);
      }
      pmaddubsw(dst, op);
    }
  }
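
  // For illustration only, the signedness trick in the two helpers above:
  // pmaddubsw computes word[i] = u8(a[2i]) * s8(b[2i]) + u8(a[2i+1]) *
  // s8(b[2i+1]) with saturation. Placing the 0x01 splat in the unsigned slot
  // (the S variant) sums the signed src bytes, while placing src in the
  // unsigned slot (the U variant) sums the unsigned src bytes; the products
  // are small enough that the saturation never kicks in here.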
  void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
                    XMMRegister scratch, Register tmp, bool omit_add = false) {
    ASM_CODE_COMMENT(this);
    if (omit_add) {
      // We have determined that the indices are immediates, and they are
      // either within bounds, or the top bit is set, so we can omit the add.
      Pshufb(dst, src, mask);
      return;
    }

    // Out-of-range indices should return 0. Add 112 so that any index > 15
    // ends up with its top bit set (indices from 16 on map to 128 and above,
    // saturating at 255), which makes pshufb zero that lane.
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_swizzle_mask(), tmp);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpaddusb(scratch, mask, op);
      vpshufb(dst, src, scratch);
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      movaps(scratch, op);
      if (dst != src) {
        DCHECK_NE(dst, mask);
        movaps(dst, src);
      }
      paddusb(scratch, mask);
      pshufb(dst, scratch);
    }
  }

  void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
                   XMMRegister tmp2, Register scratch) {
    ASM_CODE_COMMENT(this);
    DCHECK_NE(dst, tmp1);
    DCHECK_NE(src, tmp1);
    DCHECK_NE(dst, tmp2);
    DCHECK_NE(src, tmp2);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vmovdqa(tmp1, ExternalReferenceAsOperand(
                        ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                        scratch));
      vpandn(tmp2, tmp1, src);
      vpand(dst, tmp1, src);
      vmovdqa(tmp1, ExternalReferenceAsOperand(
                        ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
                        scratch));
      vpsrlw(tmp2, tmp2, 4);
      vpshufb(dst, tmp1, dst);
      vpshufb(tmp2, tmp1, tmp2);
      vpaddb(dst, dst, tmp2);
    } else if (CpuFeatures::IsSupported(INTEL_ATOM)) {
      // Pre-Goldmont low-power Intel microarchitectures have a very slow
      // PSHUFB instruction, so use a PSHUFB-free divide-and-conquer algorithm
      // on these processors. The ATOM CPU feature captures exactly the right
      // set of processors.
      movaps(tmp1, src);
      psrlw(tmp1, 1);
      if (dst != src) {
        movaps(dst, src);
      }
      andps(tmp1, ExternalReferenceAsOperand(
                      ExternalReference::address_of_wasm_i8x16_splat_0x55(),
                      scratch));
      psubb(dst, tmp1);
      Operand splat_0x33 = ExternalReferenceAsOperand(
          ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
      movaps(tmp1, dst);
      andps(dst, splat_0x33);
      psrlw(tmp1, 2);
      andps(tmp1, splat_0x33);
      paddb(dst, tmp1);
      movaps(tmp1, dst);
      psrlw(dst, 4);
      paddb(dst, tmp1);
      andps(dst, ExternalReferenceAsOperand(
                     ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                     scratch));
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      movaps(tmp1, ExternalReferenceAsOperand(
                       ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                       scratch));
      Operand mask = ExternalReferenceAsOperand(
          ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
      if (tmp2 != tmp1) {
        movaps(tmp2, tmp1);
      }
      andps(tmp1, src);
      andnps(tmp2, src);
      psrlw(tmp2, 4);
      movaps(dst, mask);
      pshufb(dst, tmp1);
      movaps(tmp1, mask);
      pshufb(tmp1, tmp2);
      paddb(dst, tmp1);
    }
  }
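
  // For illustration only, the idea behind the PSHUFB paths of I8x16Popcnt
  // above: the popcnt mask is a 16-byte table with table[i] = popcnt(i) for
  // i in [0, 15], and pshufb acts as 16 parallel 4-bit table lookups, so
  //   popcnt(byte) == table[byte & 0x0f] + table[byte >> 4].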
 private:
  // All implementation-specific methods must be called through this.
  Impl* impl() { return static_cast<Impl*>(this); }

  Operand ExternalReferenceAsOperand(ExternalReference reference,
                                     Register scratch) {
    return impl()->ExternalReferenceAsOperand(reference, scratch);
  }

  using FloatInstruction = void (SharedTurboAssembler::*)(XMMRegister,
                                                          XMMRegister, Operand);
  void FloatUnop(XMMRegister dst, XMMRegister src, Register tmp,
                 FloatInstruction op, ExternalReference ext) {
    if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
      movaps(dst, src);
      src = dst;
    }
    SharedTurboAssembler* assm = this;
    (assm->*op)(dst, src, ExternalReferenceAsOperand(ext, tmp));
  }
};

}  // namespace internal
}  // namespace v8

#endif  // V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_