11cb0ef41Sopenharmony_ci// Copyright 2021 the V8 project authors. All rights reserved.
21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
31cb0ef41Sopenharmony_ci// found in the LICENSE file.
41cb0ef41Sopenharmony_ci
51cb0ef41Sopenharmony_ci#include "src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h"
61cb0ef41Sopenharmony_ci
71cb0ef41Sopenharmony_ci#include "src/codegen/assembler.h"
81cb0ef41Sopenharmony_ci#include "src/codegen/cpu-features.h"
91cb0ef41Sopenharmony_ci#include "src/codegen/register.h"
101cb0ef41Sopenharmony_ci
111cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32
121cb0ef41Sopenharmony_ci#include "src/codegen/ia32/register-ia32.h"
131cb0ef41Sopenharmony_ci#elif V8_TARGET_ARCH_X64
141cb0ef41Sopenharmony_ci#include "src/codegen/x64/register-x64.h"
151cb0ef41Sopenharmony_ci#else
161cb0ef41Sopenharmony_ci#error Unsupported target architecture.
171cb0ef41Sopenharmony_ci#endif
181cb0ef41Sopenharmony_ci
// On IA32 an Operand can be a mere wrapper around a single register. Callers
// holding such an Operand should call the overload that takes |src| as a
// Register instead (e.g. I8x16Splat), so the Operand overloads use this macro
// to assert that they were not handed a register-only Operand. On every other
// target the macro expands to nothing.
#if V8_TARGET_ARCH_IA32
#define DCHECK_OPERAND_IS_NOT_REG(op) DCHECK(!(op).is_reg_only());
#else
#define DCHECK_OPERAND_IS_NOT_REG(op)
#endif
261cb0ef41Sopenharmony_ci
271cb0ef41Sopenharmony_cinamespace v8 {
281cb0ef41Sopenharmony_cinamespace internal {
291cb0ef41Sopenharmony_ci
301cb0ef41Sopenharmony_civoid SharedTurboAssembler::Move(Register dst, uint32_t src) {
311cb0ef41Sopenharmony_ci  // Helper to paper over the different assembler function names.
321cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32
331cb0ef41Sopenharmony_ci  mov(dst, Immediate(src));
341cb0ef41Sopenharmony_ci#elif V8_TARGET_ARCH_X64
351cb0ef41Sopenharmony_ci  movl(dst, Immediate(src));
361cb0ef41Sopenharmony_ci#else
371cb0ef41Sopenharmony_ci#error Unsupported target architecture.
381cb0ef41Sopenharmony_ci#endif
391cb0ef41Sopenharmony_ci}
401cb0ef41Sopenharmony_ci
411cb0ef41Sopenharmony_civoid SharedTurboAssembler::Move(Register dst, Register src) {
421cb0ef41Sopenharmony_ci  // Helper to paper over the different assembler function names.
431cb0ef41Sopenharmony_ci  if (dst != src) {
441cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32
451cb0ef41Sopenharmony_ci    mov(dst, src);
461cb0ef41Sopenharmony_ci#elif V8_TARGET_ARCH_X64
471cb0ef41Sopenharmony_ci    movq(dst, src);
481cb0ef41Sopenharmony_ci#else
491cb0ef41Sopenharmony_ci#error Unsupported target architecture.
501cb0ef41Sopenharmony_ci#endif
511cb0ef41Sopenharmony_ci  }
521cb0ef41Sopenharmony_ci}
531cb0ef41Sopenharmony_ci
541cb0ef41Sopenharmony_civoid SharedTurboAssembler::Add(Register dst, Immediate src) {
551cb0ef41Sopenharmony_ci  // Helper to paper over the different assembler function names.
561cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32
571cb0ef41Sopenharmony_ci  add(dst, src);
581cb0ef41Sopenharmony_ci#elif V8_TARGET_ARCH_X64
591cb0ef41Sopenharmony_ci  addq(dst, src);
601cb0ef41Sopenharmony_ci#else
611cb0ef41Sopenharmony_ci#error Unsupported target architecture.
621cb0ef41Sopenharmony_ci#endif
631cb0ef41Sopenharmony_ci}
641cb0ef41Sopenharmony_ci
651cb0ef41Sopenharmony_civoid SharedTurboAssembler::And(Register dst, Immediate src) {
661cb0ef41Sopenharmony_ci  // Helper to paper over the different assembler function names.
671cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32
681cb0ef41Sopenharmony_ci  and_(dst, src);
691cb0ef41Sopenharmony_ci#elif V8_TARGET_ARCH_X64
701cb0ef41Sopenharmony_ci  if (is_uint32(src.value())) {
711cb0ef41Sopenharmony_ci    andl(dst, src);
721cb0ef41Sopenharmony_ci  } else {
731cb0ef41Sopenharmony_ci    andq(dst, src);
741cb0ef41Sopenharmony_ci  }
751cb0ef41Sopenharmony_ci#else
761cb0ef41Sopenharmony_ci#error Unsupported target architecture.
771cb0ef41Sopenharmony_ci#endif
781cb0ef41Sopenharmony_ci}
791cb0ef41Sopenharmony_ci
801cb0ef41Sopenharmony_civoid SharedTurboAssembler::Movhps(XMMRegister dst, XMMRegister src1,
811cb0ef41Sopenharmony_ci                                  Operand src2) {
821cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
831cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, AVX);
841cb0ef41Sopenharmony_ci    vmovhps(dst, src1, src2);
851cb0ef41Sopenharmony_ci  } else {
861cb0ef41Sopenharmony_ci    if (dst != src1) {
871cb0ef41Sopenharmony_ci      movaps(dst, src1);
881cb0ef41Sopenharmony_ci    }
891cb0ef41Sopenharmony_ci    movhps(dst, src2);
901cb0ef41Sopenharmony_ci  }
911cb0ef41Sopenharmony_ci}
921cb0ef41Sopenharmony_ci
931cb0ef41Sopenharmony_civoid SharedTurboAssembler::Movlps(XMMRegister dst, XMMRegister src1,
941cb0ef41Sopenharmony_ci                                  Operand src2) {
951cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
961cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, AVX);
971cb0ef41Sopenharmony_ci    vmovlps(dst, src1, src2);
981cb0ef41Sopenharmony_ci  } else {
991cb0ef41Sopenharmony_ci    if (dst != src1) {
1001cb0ef41Sopenharmony_ci      movaps(dst, src1);
1011cb0ef41Sopenharmony_ci    }
1021cb0ef41Sopenharmony_ci    movlps(dst, src2);
1031cb0ef41Sopenharmony_ci  }
1041cb0ef41Sopenharmony_ci}
1051cb0ef41Sopenharmony_ci
1061cb0ef41Sopenharmony_civoid SharedTurboAssembler::Pblendvb(XMMRegister dst, XMMRegister src1,
1071cb0ef41Sopenharmony_ci                                    XMMRegister src2, XMMRegister mask) {
1081cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
1091cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, AVX);
1101cb0ef41Sopenharmony_ci    vpblendvb(dst, src1, src2, mask);
1111cb0ef41Sopenharmony_ci  } else {
1121cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, SSE4_1);
1131cb0ef41Sopenharmony_ci    DCHECK_EQ(mask, xmm0);
1141cb0ef41Sopenharmony_ci    DCHECK_EQ(dst, src1);
1151cb0ef41Sopenharmony_ci    pblendvb(dst, src2);
1161cb0ef41Sopenharmony_ci  }
1171cb0ef41Sopenharmony_ci}
1181cb0ef41Sopenharmony_ci
1191cb0ef41Sopenharmony_civoid SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1,
1201cb0ef41Sopenharmony_ci                                  XMMRegister src2, uint8_t imm8) {
1211cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
1221cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
1231cb0ef41Sopenharmony_ci    vshufps(dst, src1, src2, imm8);
1241cb0ef41Sopenharmony_ci  } else {
1251cb0ef41Sopenharmony_ci    if (dst != src1) {
1261cb0ef41Sopenharmony_ci      movaps(dst, src1);
1271cb0ef41Sopenharmony_ci    }
1281cb0ef41Sopenharmony_ci    shufps(dst, src2, imm8);
1291cb0ef41Sopenharmony_ci  }
1301cb0ef41Sopenharmony_ci}
1311cb0ef41Sopenharmony_ci
1321cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
1331cb0ef41Sopenharmony_ci                                            uint8_t lane) {
1341cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
1351cb0ef41Sopenharmony_ci  if (lane == 0) {
1361cb0ef41Sopenharmony_ci    if (dst != src) {
1371cb0ef41Sopenharmony_ci      Movaps(dst, src);
1381cb0ef41Sopenharmony_ci    }
1391cb0ef41Sopenharmony_ci  } else {
1401cb0ef41Sopenharmony_ci    DCHECK_EQ(1, lane);
1411cb0ef41Sopenharmony_ci    if (CpuFeatures::IsSupported(AVX)) {
1421cb0ef41Sopenharmony_ci      CpuFeatureScope avx_scope(this, AVX);
1431cb0ef41Sopenharmony_ci      // Pass src as operand to avoid false-dependency on dst.
1441cb0ef41Sopenharmony_ci      vmovhlps(dst, src, src);
1451cb0ef41Sopenharmony_ci    } else {
1461cb0ef41Sopenharmony_ci      movhlps(dst, src);
1471cb0ef41Sopenharmony_ci    }
1481cb0ef41Sopenharmony_ci  }
1491cb0ef41Sopenharmony_ci}
1501cb0ef41Sopenharmony_ci
1511cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src,
1521cb0ef41Sopenharmony_ci                                            DoubleRegister rep, uint8_t lane) {
1531cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
1541cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
1551cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, AVX);
1561cb0ef41Sopenharmony_ci    if (lane == 0) {
1571cb0ef41Sopenharmony_ci      vmovsd(dst, src, rep);
1581cb0ef41Sopenharmony_ci    } else {
1591cb0ef41Sopenharmony_ci      vmovlhps(dst, src, rep);
1601cb0ef41Sopenharmony_ci    }
1611cb0ef41Sopenharmony_ci  } else {
1621cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, SSE4_1);
1631cb0ef41Sopenharmony_ci    if (dst != src) {
1641cb0ef41Sopenharmony_ci      DCHECK_NE(dst, rep);  // Ensure rep is not overwritten.
1651cb0ef41Sopenharmony_ci      movaps(dst, src);
1661cb0ef41Sopenharmony_ci    }
1671cb0ef41Sopenharmony_ci    if (lane == 0) {
1681cb0ef41Sopenharmony_ci      movsd(dst, rep);
1691cb0ef41Sopenharmony_ci    } else {
1701cb0ef41Sopenharmony_ci      movlhps(dst, rep);
1711cb0ef41Sopenharmony_ci    }
1721cb0ef41Sopenharmony_ci  }
1731cb0ef41Sopenharmony_ci}
1741cb0ef41Sopenharmony_ci
1751cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4Min(XMMRegister dst, XMMRegister lhs,
1761cb0ef41Sopenharmony_ci                                    XMMRegister rhs, XMMRegister scratch) {
1771cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
1781cb0ef41Sopenharmony_ci  // The minps instruction doesn't propagate NaNs and +0's in its first
1791cb0ef41Sopenharmony_ci  // operand. Perform minps in both orders, merge the results, and adjust.
1801cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
1811cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, AVX);
1821cb0ef41Sopenharmony_ci    vminps(scratch, lhs, rhs);
1831cb0ef41Sopenharmony_ci    vminps(dst, rhs, lhs);
1841cb0ef41Sopenharmony_ci  } else if (dst == lhs || dst == rhs) {
1851cb0ef41Sopenharmony_ci    XMMRegister src = dst == lhs ? rhs : lhs;
1861cb0ef41Sopenharmony_ci    movaps(scratch, src);
1871cb0ef41Sopenharmony_ci    minps(scratch, dst);
1881cb0ef41Sopenharmony_ci    minps(dst, src);
1891cb0ef41Sopenharmony_ci  } else {
1901cb0ef41Sopenharmony_ci    movaps(scratch, lhs);
1911cb0ef41Sopenharmony_ci    minps(scratch, rhs);
1921cb0ef41Sopenharmony_ci    movaps(dst, rhs);
1931cb0ef41Sopenharmony_ci    minps(dst, lhs);
1941cb0ef41Sopenharmony_ci  }
1951cb0ef41Sopenharmony_ci  // Propagate -0's and NaNs, which may be non-canonical.
1961cb0ef41Sopenharmony_ci  Orps(scratch, dst);
1971cb0ef41Sopenharmony_ci  // Canonicalize NaNs by quieting and clearing the payload.
1981cb0ef41Sopenharmony_ci  Cmpunordps(dst, dst, scratch);
1991cb0ef41Sopenharmony_ci  Orps(scratch, dst);
2001cb0ef41Sopenharmony_ci  Psrld(dst, dst, byte{10});
2011cb0ef41Sopenharmony_ci  Andnps(dst, dst, scratch);
2021cb0ef41Sopenharmony_ci}
2031cb0ef41Sopenharmony_ci
2041cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4Max(XMMRegister dst, XMMRegister lhs,
2051cb0ef41Sopenharmony_ci                                    XMMRegister rhs, XMMRegister scratch) {
2061cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
2071cb0ef41Sopenharmony_ci  // The maxps instruction doesn't propagate NaNs and +0's in its first
2081cb0ef41Sopenharmony_ci  // operand. Perform maxps in both orders, merge the results, and adjust.
2091cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
2101cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, AVX);
2111cb0ef41Sopenharmony_ci    vmaxps(scratch, lhs, rhs);
2121cb0ef41Sopenharmony_ci    vmaxps(dst, rhs, lhs);
2131cb0ef41Sopenharmony_ci  } else if (dst == lhs || dst == rhs) {
2141cb0ef41Sopenharmony_ci    XMMRegister src = dst == lhs ? rhs : lhs;
2151cb0ef41Sopenharmony_ci    movaps(scratch, src);
2161cb0ef41Sopenharmony_ci    maxps(scratch, dst);
2171cb0ef41Sopenharmony_ci    maxps(dst, src);
2181cb0ef41Sopenharmony_ci  } else {
2191cb0ef41Sopenharmony_ci    movaps(scratch, lhs);
2201cb0ef41Sopenharmony_ci    maxps(scratch, rhs);
2211cb0ef41Sopenharmony_ci    movaps(dst, rhs);
2221cb0ef41Sopenharmony_ci    maxps(dst, lhs);
2231cb0ef41Sopenharmony_ci  }
2241cb0ef41Sopenharmony_ci  // Find discrepancies.
2251cb0ef41Sopenharmony_ci  Xorps(dst, scratch);
2261cb0ef41Sopenharmony_ci  // Propagate NaNs, which may be non-canonical.
2271cb0ef41Sopenharmony_ci  Orps(scratch, dst);
2281cb0ef41Sopenharmony_ci  // Propagate sign discrepancy and (subtle) quiet NaNs.
2291cb0ef41Sopenharmony_ci  Subps(scratch, scratch, dst);
2301cb0ef41Sopenharmony_ci  // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
2311cb0ef41Sopenharmony_ci  Cmpunordps(dst, dst, scratch);
2321cb0ef41Sopenharmony_ci  Psrld(dst, dst, byte{10});
2331cb0ef41Sopenharmony_ci  Andnps(dst, dst, scratch);
2341cb0ef41Sopenharmony_ci}
2351cb0ef41Sopenharmony_ci
2361cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs,
2371cb0ef41Sopenharmony_ci                                    XMMRegister rhs, XMMRegister scratch) {
2381cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
2391cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
2401cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, AVX);
2411cb0ef41Sopenharmony_ci    // The minpd instruction doesn't propagate NaNs and +0's in its first
2421cb0ef41Sopenharmony_ci    // operand. Perform minpd in both orders, merge the resuls, and adjust.
2431cb0ef41Sopenharmony_ci    vminpd(scratch, lhs, rhs);
2441cb0ef41Sopenharmony_ci    vminpd(dst, rhs, lhs);
2451cb0ef41Sopenharmony_ci    // propagate -0's and NaNs, which may be non-canonical.
2461cb0ef41Sopenharmony_ci    vorpd(scratch, scratch, dst);
2471cb0ef41Sopenharmony_ci    // Canonicalize NaNs by quieting and clearing the payload.
2481cb0ef41Sopenharmony_ci    vcmpunordpd(dst, dst, scratch);
2491cb0ef41Sopenharmony_ci    vorpd(scratch, scratch, dst);
2501cb0ef41Sopenharmony_ci    vpsrlq(dst, dst, byte{13});
2511cb0ef41Sopenharmony_ci    vandnpd(dst, dst, scratch);
2521cb0ef41Sopenharmony_ci  } else {
2531cb0ef41Sopenharmony_ci    // Compare lhs with rhs, and rhs with lhs, and have the results in scratch
2541cb0ef41Sopenharmony_ci    // and dst. If dst overlaps with lhs or rhs, we can save a move.
2551cb0ef41Sopenharmony_ci    if (dst == lhs || dst == rhs) {
2561cb0ef41Sopenharmony_ci      XMMRegister src = dst == lhs ? rhs : lhs;
2571cb0ef41Sopenharmony_ci      movaps(scratch, src);
2581cb0ef41Sopenharmony_ci      minpd(scratch, dst);
2591cb0ef41Sopenharmony_ci      minpd(dst, src);
2601cb0ef41Sopenharmony_ci    } else {
2611cb0ef41Sopenharmony_ci      movaps(scratch, lhs);
2621cb0ef41Sopenharmony_ci      movaps(dst, rhs);
2631cb0ef41Sopenharmony_ci      minpd(scratch, rhs);
2641cb0ef41Sopenharmony_ci      minpd(dst, lhs);
2651cb0ef41Sopenharmony_ci    }
2661cb0ef41Sopenharmony_ci    orpd(scratch, dst);
2671cb0ef41Sopenharmony_ci    cmpunordpd(dst, scratch);
2681cb0ef41Sopenharmony_ci    orpd(scratch, dst);
2691cb0ef41Sopenharmony_ci    psrlq(dst, byte{13});
2701cb0ef41Sopenharmony_ci    andnpd(dst, scratch);
2711cb0ef41Sopenharmony_ci  }
2721cb0ef41Sopenharmony_ci}
2731cb0ef41Sopenharmony_ci
2741cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2Max(XMMRegister dst, XMMRegister lhs,
2751cb0ef41Sopenharmony_ci                                    XMMRegister rhs, XMMRegister scratch) {
2761cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
2771cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
2781cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, AVX);
2791cb0ef41Sopenharmony_ci    // The maxpd instruction doesn't propagate NaNs and +0's in its first
2801cb0ef41Sopenharmony_ci    // operand. Perform maxpd in both orders, merge the resuls, and adjust.
2811cb0ef41Sopenharmony_ci    vmaxpd(scratch, lhs, rhs);
2821cb0ef41Sopenharmony_ci    vmaxpd(dst, rhs, lhs);
2831cb0ef41Sopenharmony_ci    // Find discrepancies.
2841cb0ef41Sopenharmony_ci    vxorpd(dst, dst, scratch);
2851cb0ef41Sopenharmony_ci    // Propagate NaNs, which may be non-canonical.
2861cb0ef41Sopenharmony_ci    vorpd(scratch, scratch, dst);
2871cb0ef41Sopenharmony_ci    // Propagate sign discrepancy and (subtle) quiet NaNs.
2881cb0ef41Sopenharmony_ci    vsubpd(scratch, scratch, dst);
2891cb0ef41Sopenharmony_ci    // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
2901cb0ef41Sopenharmony_ci    vcmpunordpd(dst, dst, scratch);
2911cb0ef41Sopenharmony_ci    vpsrlq(dst, dst, byte{13});
2921cb0ef41Sopenharmony_ci    vandnpd(dst, dst, scratch);
2931cb0ef41Sopenharmony_ci  } else {
2941cb0ef41Sopenharmony_ci    if (dst == lhs || dst == rhs) {
2951cb0ef41Sopenharmony_ci      XMMRegister src = dst == lhs ? rhs : lhs;
2961cb0ef41Sopenharmony_ci      movaps(scratch, src);
2971cb0ef41Sopenharmony_ci      maxpd(scratch, dst);
2981cb0ef41Sopenharmony_ci      maxpd(dst, src);
2991cb0ef41Sopenharmony_ci    } else {
3001cb0ef41Sopenharmony_ci      movaps(scratch, lhs);
3011cb0ef41Sopenharmony_ci      movaps(dst, rhs);
3021cb0ef41Sopenharmony_ci      maxpd(scratch, rhs);
3031cb0ef41Sopenharmony_ci      maxpd(dst, lhs);
3041cb0ef41Sopenharmony_ci    }
3051cb0ef41Sopenharmony_ci    xorpd(dst, scratch);
3061cb0ef41Sopenharmony_ci    orpd(scratch, dst);
3071cb0ef41Sopenharmony_ci    subpd(scratch, dst);
3081cb0ef41Sopenharmony_ci    cmpunordpd(dst, scratch);
3091cb0ef41Sopenharmony_ci    psrlq(dst, byte{13});
3101cb0ef41Sopenharmony_ci    andnpd(dst, scratch);
3111cb0ef41Sopenharmony_ci  }
3121cb0ef41Sopenharmony_ci}
3131cb0ef41Sopenharmony_ci
3141cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) {
3151cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
3161cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX2)) {
3171cb0ef41Sopenharmony_ci    CpuFeatureScope avx2_scope(this, AVX2);
3181cb0ef41Sopenharmony_ci    vbroadcastss(dst, src);
3191cb0ef41Sopenharmony_ci  } else if (CpuFeatures::IsSupported(AVX)) {
3201cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
3211cb0ef41Sopenharmony_ci    vshufps(dst, src, src, 0);
3221cb0ef41Sopenharmony_ci  } else {
3231cb0ef41Sopenharmony_ci    if (dst == src) {
3241cb0ef41Sopenharmony_ci      // 1 byte shorter than pshufd.
3251cb0ef41Sopenharmony_ci      shufps(dst, src, 0);
3261cb0ef41Sopenharmony_ci    } else {
3271cb0ef41Sopenharmony_ci      pshufd(dst, src, 0);
3281cb0ef41Sopenharmony_ci    }
3291cb0ef41Sopenharmony_ci  }
3301cb0ef41Sopenharmony_ci}
3311cb0ef41Sopenharmony_ci
3321cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4ExtractLane(FloatRegister dst, XMMRegister src,
3331cb0ef41Sopenharmony_ci                                            uint8_t lane) {
3341cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
3351cb0ef41Sopenharmony_ci  DCHECK_LT(lane, 4);
3361cb0ef41Sopenharmony_ci  // These instructions are shorter than insertps, but will leave junk in
3371cb0ef41Sopenharmony_ci  // the top lanes of dst.
3381cb0ef41Sopenharmony_ci  if (lane == 0) {
3391cb0ef41Sopenharmony_ci    if (dst != src) {
3401cb0ef41Sopenharmony_ci      Movaps(dst, src);
3411cb0ef41Sopenharmony_ci    }
3421cb0ef41Sopenharmony_ci  } else if (lane == 1) {
3431cb0ef41Sopenharmony_ci    Movshdup(dst, src);
3441cb0ef41Sopenharmony_ci  } else if (lane == 2 && dst == src) {
3451cb0ef41Sopenharmony_ci    // Check dst == src to avoid false dependency on dst.
3461cb0ef41Sopenharmony_ci    Movhlps(dst, src);
3471cb0ef41Sopenharmony_ci  } else if (dst == src) {
3481cb0ef41Sopenharmony_ci    Shufps(dst, src, src, lane);
3491cb0ef41Sopenharmony_ci  } else {
3501cb0ef41Sopenharmony_ci    Pshufd(dst, src, lane);
3511cb0ef41Sopenharmony_ci  }
3521cb0ef41Sopenharmony_ci}
3531cb0ef41Sopenharmony_ci
3541cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
3551cb0ef41Sopenharmony_ci                                           uint8_t laneidx) {
3561cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
3571cb0ef41Sopenharmony_ci  if (laneidx == 0) {
3581cb0ef41Sopenharmony_ci    Movss(dst, src);
3591cb0ef41Sopenharmony_ci  } else {
3601cb0ef41Sopenharmony_ci    DCHECK_GE(3, laneidx);
3611cb0ef41Sopenharmony_ci    Extractps(dst, src, laneidx);
3621cb0ef41Sopenharmony_ci  }
3631cb0ef41Sopenharmony_ci}
3641cb0ef41Sopenharmony_ci
3651cb0ef41Sopenharmony_citemplate <typename Op>
3661cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src,
3671cb0ef41Sopenharmony_ci                                             XMMRegister scratch) {
3681cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
3691cb0ef41Sopenharmony_ci  DCHECK(!CpuFeatures::IsSupported(AVX2));
3701cb0ef41Sopenharmony_ci  CpuFeatureScope ssse3_scope(this, SSSE3);
3711cb0ef41Sopenharmony_ci  Movd(dst, src);
3721cb0ef41Sopenharmony_ci  Xorps(scratch, scratch);
3731cb0ef41Sopenharmony_ci  Pshufb(dst, scratch);
3741cb0ef41Sopenharmony_ci}
3751cb0ef41Sopenharmony_ci
3761cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
3771cb0ef41Sopenharmony_ci                                      XMMRegister scratch) {
3781cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
3791cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX2)) {
3801cb0ef41Sopenharmony_ci    CpuFeatureScope avx2_scope(this, AVX2);
3811cb0ef41Sopenharmony_ci    Movd(scratch, src);
3821cb0ef41Sopenharmony_ci    vpbroadcastb(dst, scratch);
3831cb0ef41Sopenharmony_ci  } else {
3841cb0ef41Sopenharmony_ci    I8x16SplatPreAvx2(dst, src, scratch);
3851cb0ef41Sopenharmony_ci  }
3861cb0ef41Sopenharmony_ci}
3871cb0ef41Sopenharmony_ci
3881cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
3891cb0ef41Sopenharmony_ci                                      XMMRegister scratch) {
3901cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
3911cb0ef41Sopenharmony_ci  DCHECK_OPERAND_IS_NOT_REG(src);
3921cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX2)) {
3931cb0ef41Sopenharmony_ci    CpuFeatureScope avx2_scope(this, AVX2);
3941cb0ef41Sopenharmony_ci    vpbroadcastb(dst, src);
3951cb0ef41Sopenharmony_ci  } else {
3961cb0ef41Sopenharmony_ci    I8x16SplatPreAvx2(dst, src, scratch);
3971cb0ef41Sopenharmony_ci  }
3981cb0ef41Sopenharmony_ci}
3991cb0ef41Sopenharmony_ci
4001cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
4011cb0ef41Sopenharmony_ci                                    uint8_t src2, Register tmp1,
4021cb0ef41Sopenharmony_ci                                    XMMRegister tmp2) {
4031cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
4041cb0ef41Sopenharmony_ci  DCHECK_NE(dst, tmp2);
4051cb0ef41Sopenharmony_ci  // Perform 16-bit shift, then mask away low bits.
4061cb0ef41Sopenharmony_ci  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
4071cb0ef41Sopenharmony_ci    movaps(dst, src1);
4081cb0ef41Sopenharmony_ci    src1 = dst;
4091cb0ef41Sopenharmony_ci  }
4101cb0ef41Sopenharmony_ci
4111cb0ef41Sopenharmony_ci  uint8_t shift = truncate_to_int3(src2);
4121cb0ef41Sopenharmony_ci  Psllw(dst, src1, byte{shift});
4131cb0ef41Sopenharmony_ci
4141cb0ef41Sopenharmony_ci  uint8_t bmask = static_cast<uint8_t>(0xff << shift);
4151cb0ef41Sopenharmony_ci  uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
4161cb0ef41Sopenharmony_ci  Move(tmp1, mask);
4171cb0ef41Sopenharmony_ci  Movd(tmp2, tmp1);
4181cb0ef41Sopenharmony_ci  Pshufd(tmp2, tmp2, uint8_t{0});
4191cb0ef41Sopenharmony_ci  Pand(dst, tmp2);
4201cb0ef41Sopenharmony_ci}
4211cb0ef41Sopenharmony_ci
4221cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
4231cb0ef41Sopenharmony_ci                                    Register src2, Register tmp1,
4241cb0ef41Sopenharmony_ci                                    XMMRegister tmp2, XMMRegister tmp3) {
4251cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
4261cb0ef41Sopenharmony_ci  DCHECK(!AreAliased(dst, tmp2, tmp3));
4271cb0ef41Sopenharmony_ci  DCHECK(!AreAliased(src1, tmp2, tmp3));
4281cb0ef41Sopenharmony_ci
4291cb0ef41Sopenharmony_ci  // Take shift value modulo 8.
4301cb0ef41Sopenharmony_ci  Move(tmp1, src2);
4311cb0ef41Sopenharmony_ci  And(tmp1, Immediate(7));
4321cb0ef41Sopenharmony_ci  Add(tmp1, Immediate(8));
4331cb0ef41Sopenharmony_ci  // Create a mask to unset high bits.
4341cb0ef41Sopenharmony_ci  Movd(tmp3, tmp1);
4351cb0ef41Sopenharmony_ci  Pcmpeqd(tmp2, tmp2);
4361cb0ef41Sopenharmony_ci  Psrlw(tmp2, tmp2, tmp3);
4371cb0ef41Sopenharmony_ci  Packuswb(tmp2, tmp2);
4381cb0ef41Sopenharmony_ci  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
4391cb0ef41Sopenharmony_ci    movaps(dst, src1);
4401cb0ef41Sopenharmony_ci    src1 = dst;
4411cb0ef41Sopenharmony_ci  }
4421cb0ef41Sopenharmony_ci  // Mask off the unwanted bits before word-shifting.
4431cb0ef41Sopenharmony_ci  Pand(dst, src1, tmp2);
4441cb0ef41Sopenharmony_ci  Add(tmp1, Immediate(-8));
4451cb0ef41Sopenharmony_ci  Movd(tmp3, tmp1);
4461cb0ef41Sopenharmony_ci  Psllw(dst, dst, tmp3);
4471cb0ef41Sopenharmony_ci}
4481cb0ef41Sopenharmony_ci
4491cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
4501cb0ef41Sopenharmony_ci                                     uint8_t src2, XMMRegister tmp) {
4511cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
4521cb0ef41Sopenharmony_ci  // Unpack bytes into words, do word (16-bit) shifts, and repack.
4531cb0ef41Sopenharmony_ci  DCHECK_NE(dst, tmp);
4541cb0ef41Sopenharmony_ci  uint8_t shift = truncate_to_int3(src2) + 8;
4551cb0ef41Sopenharmony_ci
4561cb0ef41Sopenharmony_ci  Punpckhbw(tmp, src1);
4571cb0ef41Sopenharmony_ci  Punpcklbw(dst, src1);
4581cb0ef41Sopenharmony_ci  Psraw(tmp, shift);
4591cb0ef41Sopenharmony_ci  Psraw(dst, shift);
4601cb0ef41Sopenharmony_ci  Packsswb(dst, tmp);
4611cb0ef41Sopenharmony_ci}
4621cb0ef41Sopenharmony_ci
4631cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
4641cb0ef41Sopenharmony_ci                                     Register src2, Register tmp1,
4651cb0ef41Sopenharmony_ci                                     XMMRegister tmp2, XMMRegister tmp3) {
4661cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
4671cb0ef41Sopenharmony_ci  DCHECK(!AreAliased(dst, tmp2, tmp3));
4681cb0ef41Sopenharmony_ci  DCHECK_NE(src1, tmp2);
4691cb0ef41Sopenharmony_ci
4701cb0ef41Sopenharmony_ci  // Unpack the bytes into words, do arithmetic shifts, and repack.
4711cb0ef41Sopenharmony_ci  Punpckhbw(tmp2, src1);
4721cb0ef41Sopenharmony_ci  Punpcklbw(dst, src1);
4731cb0ef41Sopenharmony_ci  // Prepare shift value
4741cb0ef41Sopenharmony_ci  Move(tmp1, src2);
4751cb0ef41Sopenharmony_ci  // Take shift value modulo 8.
4761cb0ef41Sopenharmony_ci  And(tmp1, Immediate(7));
4771cb0ef41Sopenharmony_ci  Add(tmp1, Immediate(8));
4781cb0ef41Sopenharmony_ci  Movd(tmp3, tmp1);
4791cb0ef41Sopenharmony_ci  Psraw(tmp2, tmp3);
4801cb0ef41Sopenharmony_ci  Psraw(dst, tmp3);
4811cb0ef41Sopenharmony_ci  Packsswb(dst, tmp2);
4821cb0ef41Sopenharmony_ci}
4831cb0ef41Sopenharmony_ci
4841cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
4851cb0ef41Sopenharmony_ci                                     uint8_t src2, Register tmp1,
4861cb0ef41Sopenharmony_ci                                     XMMRegister tmp2) {
4871cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
4881cb0ef41Sopenharmony_ci  DCHECK_NE(dst, tmp2);
4891cb0ef41Sopenharmony_ci  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
4901cb0ef41Sopenharmony_ci    movaps(dst, src1);
4911cb0ef41Sopenharmony_ci    src1 = dst;
4921cb0ef41Sopenharmony_ci  }
4931cb0ef41Sopenharmony_ci
4941cb0ef41Sopenharmony_ci  // Perform 16-bit shift, then mask away high bits.
4951cb0ef41Sopenharmony_ci  uint8_t shift = truncate_to_int3(src2);
4961cb0ef41Sopenharmony_ci  Psrlw(dst, src1, shift);
4971cb0ef41Sopenharmony_ci
4981cb0ef41Sopenharmony_ci  uint8_t bmask = 0xff >> shift;
4991cb0ef41Sopenharmony_ci  uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
5001cb0ef41Sopenharmony_ci  Move(tmp1, mask);
5011cb0ef41Sopenharmony_ci  Movd(tmp2, tmp1);
5021cb0ef41Sopenharmony_ci  Pshufd(tmp2, tmp2, byte{0});
5031cb0ef41Sopenharmony_ci  Pand(dst, tmp2);
5041cb0ef41Sopenharmony_ci}
5051cb0ef41Sopenharmony_ci
5061cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
5071cb0ef41Sopenharmony_ci                                     Register src2, Register tmp1,
5081cb0ef41Sopenharmony_ci                                     XMMRegister tmp2, XMMRegister tmp3) {
5091cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
5101cb0ef41Sopenharmony_ci  DCHECK(!AreAliased(dst, tmp2, tmp3));
5111cb0ef41Sopenharmony_ci  DCHECK_NE(src1, tmp2);
5121cb0ef41Sopenharmony_ci
5131cb0ef41Sopenharmony_ci  // Unpack the bytes into words, do logical shifts, and repack.
5141cb0ef41Sopenharmony_ci  Punpckhbw(tmp2, src1);
5151cb0ef41Sopenharmony_ci  Punpcklbw(dst, src1);
5161cb0ef41Sopenharmony_ci  // Prepare shift value.
5171cb0ef41Sopenharmony_ci  Move(tmp1, src2);
5181cb0ef41Sopenharmony_ci  // Take shift value modulo 8.
5191cb0ef41Sopenharmony_ci  And(tmp1, Immediate(7));
5201cb0ef41Sopenharmony_ci  Add(tmp1, Immediate(8));
5211cb0ef41Sopenharmony_ci  Movd(tmp3, tmp1);
5221cb0ef41Sopenharmony_ci  Psrlw(tmp2, tmp3);
5231cb0ef41Sopenharmony_ci  Psrlw(dst, tmp3);
5241cb0ef41Sopenharmony_ci  Packuswb(dst, tmp2);
5251cb0ef41Sopenharmony_ci}
5261cb0ef41Sopenharmony_ci
5271cb0ef41Sopenharmony_citemplate <typename Op>
5281cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8SplatPreAvx2(XMMRegister dst, Op src) {
5291cb0ef41Sopenharmony_ci  DCHECK(!CpuFeatures::IsSupported(AVX2));
5301cb0ef41Sopenharmony_ci  Movd(dst, src);
5311cb0ef41Sopenharmony_ci  Pshuflw(dst, dst, uint8_t{0x0});
5321cb0ef41Sopenharmony_ci  Punpcklqdq(dst, dst);
5331cb0ef41Sopenharmony_ci}
5341cb0ef41Sopenharmony_ci
5351cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8Splat(XMMRegister dst, Register src) {
5361cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
5371cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX2)) {
5381cb0ef41Sopenharmony_ci    CpuFeatureScope avx2_scope(this, AVX2);
5391cb0ef41Sopenharmony_ci    Movd(dst, src);
5401cb0ef41Sopenharmony_ci    vpbroadcastw(dst, dst);
5411cb0ef41Sopenharmony_ci  } else {
5421cb0ef41Sopenharmony_ci    I16x8SplatPreAvx2(dst, src);
5431cb0ef41Sopenharmony_ci  }
5441cb0ef41Sopenharmony_ci}
5451cb0ef41Sopenharmony_ci
5461cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8Splat(XMMRegister dst, Operand src) {
5471cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
5481cb0ef41Sopenharmony_ci  DCHECK_OPERAND_IS_NOT_REG(src);
5491cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX2)) {
5501cb0ef41Sopenharmony_ci    CpuFeatureScope avx2_scope(this, AVX2);
5511cb0ef41Sopenharmony_ci    vpbroadcastw(dst, src);
5521cb0ef41Sopenharmony_ci  } else {
5531cb0ef41Sopenharmony_ci    I16x8SplatPreAvx2(dst, src);
5541cb0ef41Sopenharmony_ci  }
5551cb0ef41Sopenharmony_ci}
5561cb0ef41Sopenharmony_ci
5571cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
5581cb0ef41Sopenharmony_ci                                          XMMRegister src2, XMMRegister scratch,
5591cb0ef41Sopenharmony_ci                                          bool is_signed) {
5601cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
5611cb0ef41Sopenharmony_ci  is_signed ? Pmovsxbw(scratch, src1) : Pmovzxbw(scratch, src1);
5621cb0ef41Sopenharmony_ci  is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2);
5631cb0ef41Sopenharmony_ci  Pmullw(dst, scratch);
5641cb0ef41Sopenharmony_ci}
5651cb0ef41Sopenharmony_ci
5661cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
5671cb0ef41Sopenharmony_ci                                            XMMRegister src2,
5681cb0ef41Sopenharmony_ci                                            XMMRegister scratch) {
5691cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
5701cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
5711cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
5721cb0ef41Sopenharmony_ci    vpunpckhbw(scratch, src1, src1);
5731cb0ef41Sopenharmony_ci    vpsraw(scratch, scratch, 8);
5741cb0ef41Sopenharmony_ci    vpunpckhbw(dst, src2, src2);
5751cb0ef41Sopenharmony_ci    vpsraw(dst, dst, 8);
5761cb0ef41Sopenharmony_ci    vpmullw(dst, dst, scratch);
5771cb0ef41Sopenharmony_ci  } else {
5781cb0ef41Sopenharmony_ci    if (dst != src1) {
5791cb0ef41Sopenharmony_ci      movaps(dst, src1);
5801cb0ef41Sopenharmony_ci    }
5811cb0ef41Sopenharmony_ci    movaps(scratch, src2);
5821cb0ef41Sopenharmony_ci    punpckhbw(dst, dst);
5831cb0ef41Sopenharmony_ci    psraw(dst, 8);
5841cb0ef41Sopenharmony_ci    punpckhbw(scratch, scratch);
5851cb0ef41Sopenharmony_ci    psraw(scratch, 8);
5861cb0ef41Sopenharmony_ci    pmullw(dst, scratch);
5871cb0ef41Sopenharmony_ci  }
5881cb0ef41Sopenharmony_ci}
5891cb0ef41Sopenharmony_ci
5901cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
5911cb0ef41Sopenharmony_ci                                            XMMRegister src2,
5921cb0ef41Sopenharmony_ci                                            XMMRegister scratch) {
5931cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
5941cb0ef41Sopenharmony_ci  // The logic here is slightly complicated to handle all the cases of register
5951cb0ef41Sopenharmony_ci  // aliasing. This allows flexibility for callers in TurboFan and Liftoff.
5961cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
5971cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
5981cb0ef41Sopenharmony_ci    if (src1 == src2) {
5991cb0ef41Sopenharmony_ci      vpxor(scratch, scratch, scratch);
6001cb0ef41Sopenharmony_ci      vpunpckhbw(dst, src1, scratch);
6011cb0ef41Sopenharmony_ci      vpmullw(dst, dst, dst);
6021cb0ef41Sopenharmony_ci    } else {
6031cb0ef41Sopenharmony_ci      if (dst == src2) {
6041cb0ef41Sopenharmony_ci        // We overwrite dst, then use src2, so swap src1 and src2.
6051cb0ef41Sopenharmony_ci        std::swap(src1, src2);
6061cb0ef41Sopenharmony_ci      }
6071cb0ef41Sopenharmony_ci      vpxor(scratch, scratch, scratch);
6081cb0ef41Sopenharmony_ci      vpunpckhbw(dst, src1, scratch);
6091cb0ef41Sopenharmony_ci      vpunpckhbw(scratch, src2, scratch);
6101cb0ef41Sopenharmony_ci      vpmullw(dst, dst, scratch);
6111cb0ef41Sopenharmony_ci    }
6121cb0ef41Sopenharmony_ci  } else {
6131cb0ef41Sopenharmony_ci    if (src1 == src2) {
6141cb0ef41Sopenharmony_ci      xorps(scratch, scratch);
6151cb0ef41Sopenharmony_ci      if (dst != src1) {
6161cb0ef41Sopenharmony_ci        movaps(dst, src1);
6171cb0ef41Sopenharmony_ci      }
6181cb0ef41Sopenharmony_ci      punpckhbw(dst, scratch);
6191cb0ef41Sopenharmony_ci      pmullw(dst, scratch);
6201cb0ef41Sopenharmony_ci    } else {
6211cb0ef41Sopenharmony_ci      // When dst == src1, nothing special needs to be done.
6221cb0ef41Sopenharmony_ci      // When dst == src2, swap src1 and src2, since we overwrite dst.
6231cb0ef41Sopenharmony_ci      // When dst is unique, copy src1 to dst first.
6241cb0ef41Sopenharmony_ci      if (dst == src2) {
6251cb0ef41Sopenharmony_ci        std::swap(src1, src2);
6261cb0ef41Sopenharmony_ci        // Now, dst == src1.
6271cb0ef41Sopenharmony_ci      } else if (dst != src1) {
6281cb0ef41Sopenharmony_ci        // dst != src1 && dst != src2.
6291cb0ef41Sopenharmony_ci        movaps(dst, src1);
6301cb0ef41Sopenharmony_ci      }
6311cb0ef41Sopenharmony_ci      xorps(scratch, scratch);
6321cb0ef41Sopenharmony_ci      punpckhbw(dst, scratch);
6331cb0ef41Sopenharmony_ci      punpckhbw(scratch, src2);
6341cb0ef41Sopenharmony_ci      psrlw(scratch, 8);
6351cb0ef41Sopenharmony_ci      pmullw(dst, scratch);
6361cb0ef41Sopenharmony_ci    }
6371cb0ef41Sopenharmony_ci  }
6381cb0ef41Sopenharmony_ci}
6391cb0ef41Sopenharmony_ci
6401cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst,
6411cb0ef41Sopenharmony_ci                                                  XMMRegister src) {
6421cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
6431cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
6441cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
6451cb0ef41Sopenharmony_ci    // src = |a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p| (high)
6461cb0ef41Sopenharmony_ci    // dst = |i|i|j|j|k|k|l|l|m|m|n|n|o|o|p|p|
6471cb0ef41Sopenharmony_ci    vpunpckhbw(dst, src, src);
6481cb0ef41Sopenharmony_ci    vpsraw(dst, dst, 8);
6491cb0ef41Sopenharmony_ci  } else {
6501cb0ef41Sopenharmony_ci    CpuFeatureScope sse_scope(this, SSE4_1);
6511cb0ef41Sopenharmony_ci    if (dst == src) {
6521cb0ef41Sopenharmony_ci      // 2 bytes shorter than pshufd, but has depdency on dst.
6531cb0ef41Sopenharmony_ci      movhlps(dst, src);
6541cb0ef41Sopenharmony_ci      pmovsxbw(dst, dst);
6551cb0ef41Sopenharmony_ci    } else {
6561cb0ef41Sopenharmony_ci      // No dependency on dst.
6571cb0ef41Sopenharmony_ci      pshufd(dst, src, 0xEE);
6581cb0ef41Sopenharmony_ci      pmovsxbw(dst, dst);
6591cb0ef41Sopenharmony_ci    }
6601cb0ef41Sopenharmony_ci  }
6611cb0ef41Sopenharmony_ci}
6621cb0ef41Sopenharmony_ci
6631cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
6641cb0ef41Sopenharmony_ci                                                  XMMRegister src,
6651cb0ef41Sopenharmony_ci                                                  XMMRegister scratch) {
6661cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
6671cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
6681cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
6691cb0ef41Sopenharmony_ci    // tmp = |0|0|0|0|0|0|0|0 | 0|0|0|0|0|0|0|0|
6701cb0ef41Sopenharmony_ci    // src = |a|b|c|d|e|f|g|h | i|j|k|l|m|n|o|p|
6711cb0ef41Sopenharmony_ci    // dst = |0|a|0|b|0|c|0|d | 0|e|0|f|0|g|0|h|
6721cb0ef41Sopenharmony_ci    XMMRegister tmp = dst == src ? scratch : dst;
6731cb0ef41Sopenharmony_ci    vpxor(tmp, tmp, tmp);
6741cb0ef41Sopenharmony_ci    vpunpckhbw(dst, src, tmp);
6751cb0ef41Sopenharmony_ci  } else {
6761cb0ef41Sopenharmony_ci    CpuFeatureScope sse_scope(this, SSE4_1);
6771cb0ef41Sopenharmony_ci    if (dst == src) {
6781cb0ef41Sopenharmony_ci      // xorps can be executed on more ports than pshufd.
6791cb0ef41Sopenharmony_ci      xorps(scratch, scratch);
6801cb0ef41Sopenharmony_ci      punpckhbw(dst, scratch);
6811cb0ef41Sopenharmony_ci    } else {
6821cb0ef41Sopenharmony_ci      // No dependency on dst.
6831cb0ef41Sopenharmony_ci      pshufd(dst, src, 0xEE);
6841cb0ef41Sopenharmony_ci      pmovzxbw(dst, dst);
6851cb0ef41Sopenharmony_ci    }
6861cb0ef41Sopenharmony_ci  }
6871cb0ef41Sopenharmony_ci}
6881cb0ef41Sopenharmony_ci
6891cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
6901cb0ef41Sopenharmony_ci                                            XMMRegister src2,
6911cb0ef41Sopenharmony_ci                                            XMMRegister scratch) {
6921cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
6931cb0ef41Sopenharmony_ci  // k = i16x8.splat(0x8000)
6941cb0ef41Sopenharmony_ci  Pcmpeqd(scratch, scratch);
6951cb0ef41Sopenharmony_ci  Psllw(scratch, scratch, byte{15});
6961cb0ef41Sopenharmony_ci
6971cb0ef41Sopenharmony_ci  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
6981cb0ef41Sopenharmony_ci    movaps(dst, src1);
6991cb0ef41Sopenharmony_ci    src1 = dst;
7001cb0ef41Sopenharmony_ci  }
7011cb0ef41Sopenharmony_ci
7021cb0ef41Sopenharmony_ci  Pmulhrsw(dst, src1, src2);
7031cb0ef41Sopenharmony_ci  Pcmpeqw(scratch, dst);
7041cb0ef41Sopenharmony_ci  Pxor(dst, scratch);
7051cb0ef41Sopenharmony_ci}
7061cb0ef41Sopenharmony_ci
7071cb0ef41Sopenharmony_civoid SharedTurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
7081cb0ef41Sopenharmony_ci                                                     XMMRegister src,
7091cb0ef41Sopenharmony_ci                                                     XMMRegister tmp) {
7101cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
7111cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
7121cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
7131cb0ef41Sopenharmony_ci    // src = |a|b|c|d|e|f|g|h| (low)
7141cb0ef41Sopenharmony_ci    // scratch = |0|a|0|c|0|e|0|g|
7151cb0ef41Sopenharmony_ci    vpsrld(tmp, src, 16);
7161cb0ef41Sopenharmony_ci    // dst = |0|b|0|d|0|f|0|h|
7171cb0ef41Sopenharmony_ci    vpblendw(dst, src, tmp, 0xAA);
7181cb0ef41Sopenharmony_ci    // dst = |a+b|c+d|e+f|g+h|
7191cb0ef41Sopenharmony_ci    vpaddd(dst, tmp, dst);
7201cb0ef41Sopenharmony_ci  } else if (CpuFeatures::IsSupported(SSE4_1)) {
7211cb0ef41Sopenharmony_ci    CpuFeatureScope sse_scope(this, SSE4_1);
7221cb0ef41Sopenharmony_ci    // There is a potentially better lowering if we get rip-relative
7231cb0ef41Sopenharmony_ci    // constants, see https://github.com/WebAssembly/simd/pull/380.
7241cb0ef41Sopenharmony_ci    movaps(tmp, src);
7251cb0ef41Sopenharmony_ci    psrld(tmp, 16);
7261cb0ef41Sopenharmony_ci    if (dst != src) {
7271cb0ef41Sopenharmony_ci      movaps(dst, src);
7281cb0ef41Sopenharmony_ci    }
7291cb0ef41Sopenharmony_ci    pblendw(dst, tmp, 0xAA);
7301cb0ef41Sopenharmony_ci    paddd(dst, tmp);
7311cb0ef41Sopenharmony_ci  } else {
7321cb0ef41Sopenharmony_ci    // src = |a|b|c|d|e|f|g|h|
7331cb0ef41Sopenharmony_ci    // tmp = i32x4.splat(0x0000FFFF)
7341cb0ef41Sopenharmony_ci    pcmpeqd(tmp, tmp);
7351cb0ef41Sopenharmony_ci    psrld(tmp, byte{16});
7361cb0ef41Sopenharmony_ci    // tmp =|0|b|0|d|0|f|0|h|
7371cb0ef41Sopenharmony_ci    andps(tmp, src);
7381cb0ef41Sopenharmony_ci    // dst = |0|a|0|c|0|e|0|g|
7391cb0ef41Sopenharmony_ci    if (dst != src) {
7401cb0ef41Sopenharmony_ci      movaps(dst, src);
7411cb0ef41Sopenharmony_ci    }
7421cb0ef41Sopenharmony_ci    psrld(dst, byte{16});
7431cb0ef41Sopenharmony_ci    // dst = |a+b|c+d|e+f|g+h|
7441cb0ef41Sopenharmony_ci    paddd(dst, tmp);
7451cb0ef41Sopenharmony_ci  }
7461cb0ef41Sopenharmony_ci}
7471cb0ef41Sopenharmony_ci
7481cb0ef41Sopenharmony_ci// 1. Multiply low word into scratch.
7491cb0ef41Sopenharmony_ci// 2. Multiply high word (can be signed or unsigned) into dst.
7501cb0ef41Sopenharmony_ci// 3. Unpack and interleave scratch and dst into dst.
7511cb0ef41Sopenharmony_civoid SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
7521cb0ef41Sopenharmony_ci                                       XMMRegister src2, XMMRegister scratch,
7531cb0ef41Sopenharmony_ci                                       bool low, bool is_signed) {
7541cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
7551cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
7561cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
7571cb0ef41Sopenharmony_ci    vpmullw(scratch, src1, src2);
7581cb0ef41Sopenharmony_ci    is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
7591cb0ef41Sopenharmony_ci    low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
7601cb0ef41Sopenharmony_ci  } else {
7611cb0ef41Sopenharmony_ci    DCHECK_EQ(dst, src1);
7621cb0ef41Sopenharmony_ci    movaps(scratch, src1);
7631cb0ef41Sopenharmony_ci    pmullw(dst, src2);
7641cb0ef41Sopenharmony_ci    is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
7651cb0ef41Sopenharmony_ci    low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
7661cb0ef41Sopenharmony_ci  }
7671cb0ef41Sopenharmony_ci}
7681cb0ef41Sopenharmony_ci
7691cb0ef41Sopenharmony_civoid SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst,
7701cb0ef41Sopenharmony_ci                                                  XMMRegister src) {
7711cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
7721cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
7731cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
7741cb0ef41Sopenharmony_ci    // src = |a|b|c|d|e|f|g|h| (high)
7751cb0ef41Sopenharmony_ci    // dst = |e|e|f|f|g|g|h|h|
7761cb0ef41Sopenharmony_ci    vpunpckhwd(dst, src, src);
7771cb0ef41Sopenharmony_ci    vpsrad(dst, dst, 16);
7781cb0ef41Sopenharmony_ci  } else {
7791cb0ef41Sopenharmony_ci    CpuFeatureScope sse_scope(this, SSE4_1);
7801cb0ef41Sopenharmony_ci    if (dst == src) {
7811cb0ef41Sopenharmony_ci      // 2 bytes shorter than pshufd, but has depdency on dst.
7821cb0ef41Sopenharmony_ci      movhlps(dst, src);
7831cb0ef41Sopenharmony_ci      pmovsxwd(dst, dst);
7841cb0ef41Sopenharmony_ci    } else {
7851cb0ef41Sopenharmony_ci      // No dependency on dst.
7861cb0ef41Sopenharmony_ci      pshufd(dst, src, 0xEE);
7871cb0ef41Sopenharmony_ci      pmovsxwd(dst, dst);
7881cb0ef41Sopenharmony_ci    }
7891cb0ef41Sopenharmony_ci  }
7901cb0ef41Sopenharmony_ci}
7911cb0ef41Sopenharmony_ci
7921cb0ef41Sopenharmony_civoid SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst,
7931cb0ef41Sopenharmony_ci                                                  XMMRegister src,
7941cb0ef41Sopenharmony_ci                                                  XMMRegister scratch) {
7951cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
7961cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
7971cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
7981cb0ef41Sopenharmony_ci    // scratch = |0|0|0|0|0|0|0|0|
7991cb0ef41Sopenharmony_ci    // src     = |a|b|c|d|e|f|g|h|
8001cb0ef41Sopenharmony_ci    // dst     = |0|a|0|b|0|c|0|d|
8011cb0ef41Sopenharmony_ci    XMMRegister tmp = dst == src ? scratch : dst;
8021cb0ef41Sopenharmony_ci    vpxor(tmp, tmp, tmp);
8031cb0ef41Sopenharmony_ci    vpunpckhwd(dst, src, tmp);
8041cb0ef41Sopenharmony_ci  } else {
8051cb0ef41Sopenharmony_ci    if (dst == src) {
8061cb0ef41Sopenharmony_ci      // xorps can be executed on more ports than pshufd.
8071cb0ef41Sopenharmony_ci      xorps(scratch, scratch);
8081cb0ef41Sopenharmony_ci      punpckhwd(dst, scratch);
8091cb0ef41Sopenharmony_ci    } else {
8101cb0ef41Sopenharmony_ci      CpuFeatureScope sse_scope(this, SSE4_1);
8111cb0ef41Sopenharmony_ci      // No dependency on dst.
8121cb0ef41Sopenharmony_ci      pshufd(dst, src, 0xEE);
8131cb0ef41Sopenharmony_ci      pmovzxwd(dst, dst);
8141cb0ef41Sopenharmony_ci    }
8151cb0ef41Sopenharmony_ci  }
8161cb0ef41Sopenharmony_ci}
8171cb0ef41Sopenharmony_ci
8181cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2Neg(XMMRegister dst, XMMRegister src,
8191cb0ef41Sopenharmony_ci                                    XMMRegister scratch) {
8201cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
8211cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
8221cb0ef41Sopenharmony_ci    CpuFeatureScope scope(this, AVX);
8231cb0ef41Sopenharmony_ci    vpxor(scratch, scratch, scratch);
8241cb0ef41Sopenharmony_ci    vpsubq(dst, scratch, src);
8251cb0ef41Sopenharmony_ci  } else {
8261cb0ef41Sopenharmony_ci    if (dst == src) {
8271cb0ef41Sopenharmony_ci      movaps(scratch, src);
8281cb0ef41Sopenharmony_ci      std::swap(src, scratch);
8291cb0ef41Sopenharmony_ci    }
8301cb0ef41Sopenharmony_ci    pxor(dst, dst);
8311cb0ef41Sopenharmony_ci    psubq(dst, src);
8321cb0ef41Sopenharmony_ci  }
8331cb0ef41Sopenharmony_ci}
8341cb0ef41Sopenharmony_ci
8351cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2Abs(XMMRegister dst, XMMRegister src,
8361cb0ef41Sopenharmony_ci                                    XMMRegister scratch) {
8371cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
8381cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
8391cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
8401cb0ef41Sopenharmony_ci    XMMRegister tmp = dst == src ? scratch : dst;
8411cb0ef41Sopenharmony_ci    vpxor(tmp, tmp, tmp);
8421cb0ef41Sopenharmony_ci    vpsubq(tmp, tmp, src);
8431cb0ef41Sopenharmony_ci    vblendvpd(dst, src, tmp, src);
8441cb0ef41Sopenharmony_ci  } else {
8451cb0ef41Sopenharmony_ci    CpuFeatureScope sse_scope(this, SSE3);
8461cb0ef41Sopenharmony_ci    movshdup(scratch, src);
8471cb0ef41Sopenharmony_ci    if (dst != src) {
8481cb0ef41Sopenharmony_ci      movaps(dst, src);
8491cb0ef41Sopenharmony_ci    }
8501cb0ef41Sopenharmony_ci    psrad(scratch, 31);
8511cb0ef41Sopenharmony_ci    xorps(dst, scratch);
8521cb0ef41Sopenharmony_ci    psubq(dst, scratch);
8531cb0ef41Sopenharmony_ci  }
8541cb0ef41Sopenharmony_ci}
8551cb0ef41Sopenharmony_ci
8561cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2GtS(XMMRegister dst, XMMRegister src0,
8571cb0ef41Sopenharmony_ci                                    XMMRegister src1, XMMRegister scratch) {
8581cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
8591cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
8601cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
8611cb0ef41Sopenharmony_ci    vpcmpgtq(dst, src0, src1);
8621cb0ef41Sopenharmony_ci  } else if (CpuFeatures::IsSupported(SSE4_2)) {
8631cb0ef41Sopenharmony_ci    CpuFeatureScope sse_scope(this, SSE4_2);
8641cb0ef41Sopenharmony_ci    if (dst == src0) {
8651cb0ef41Sopenharmony_ci      pcmpgtq(dst, src1);
8661cb0ef41Sopenharmony_ci    } else if (dst == src1) {
8671cb0ef41Sopenharmony_ci      movaps(scratch, src0);
8681cb0ef41Sopenharmony_ci      pcmpgtq(scratch, src1);
8691cb0ef41Sopenharmony_ci      movaps(dst, scratch);
8701cb0ef41Sopenharmony_ci    } else {
8711cb0ef41Sopenharmony_ci      movaps(dst, src0);
8721cb0ef41Sopenharmony_ci      pcmpgtq(dst, src1);
8731cb0ef41Sopenharmony_ci    }
8741cb0ef41Sopenharmony_ci  } else {
8751cb0ef41Sopenharmony_ci    CpuFeatureScope sse_scope(this, SSE3);
8761cb0ef41Sopenharmony_ci    DCHECK_NE(dst, src0);
8771cb0ef41Sopenharmony_ci    DCHECK_NE(dst, src1);
8781cb0ef41Sopenharmony_ci    movaps(dst, src1);
8791cb0ef41Sopenharmony_ci    movaps(scratch, src0);
8801cb0ef41Sopenharmony_ci    psubq(dst, src0);
8811cb0ef41Sopenharmony_ci    pcmpeqd(scratch, src1);
8821cb0ef41Sopenharmony_ci    andps(dst, scratch);
8831cb0ef41Sopenharmony_ci    movaps(scratch, src0);
8841cb0ef41Sopenharmony_ci    pcmpgtd(scratch, src1);
8851cb0ef41Sopenharmony_ci    orps(dst, scratch);
8861cb0ef41Sopenharmony_ci    movshdup(dst, dst);
8871cb0ef41Sopenharmony_ci  }
8881cb0ef41Sopenharmony_ci}
8891cb0ef41Sopenharmony_ci
8901cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0,
8911cb0ef41Sopenharmony_ci                                    XMMRegister src1, XMMRegister scratch) {
8921cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
8931cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
8941cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
8951cb0ef41Sopenharmony_ci    vpcmpgtq(dst, src1, src0);
8961cb0ef41Sopenharmony_ci    vpcmpeqd(scratch, scratch, scratch);
8971cb0ef41Sopenharmony_ci    vpxor(dst, dst, scratch);
8981cb0ef41Sopenharmony_ci  } else if (CpuFeatures::IsSupported(SSE4_2)) {
8991cb0ef41Sopenharmony_ci    CpuFeatureScope sse_scope(this, SSE4_2);
9001cb0ef41Sopenharmony_ci    DCHECK_NE(dst, src0);
9011cb0ef41Sopenharmony_ci    if (dst != src1) {
9021cb0ef41Sopenharmony_ci      movaps(dst, src1);
9031cb0ef41Sopenharmony_ci    }
9041cb0ef41Sopenharmony_ci    pcmpgtq(dst, src0);
9051cb0ef41Sopenharmony_ci    pcmpeqd(scratch, scratch);
9061cb0ef41Sopenharmony_ci    xorps(dst, scratch);
9071cb0ef41Sopenharmony_ci  } else {
9081cb0ef41Sopenharmony_ci    CpuFeatureScope sse_scope(this, SSE3);
9091cb0ef41Sopenharmony_ci    DCHECK_NE(dst, src0);
9101cb0ef41Sopenharmony_ci    DCHECK_NE(dst, src1);
9111cb0ef41Sopenharmony_ci    movaps(dst, src0);
9121cb0ef41Sopenharmony_ci    movaps(scratch, src1);
9131cb0ef41Sopenharmony_ci    psubq(dst, src1);
9141cb0ef41Sopenharmony_ci    pcmpeqd(scratch, src0);
9151cb0ef41Sopenharmony_ci    andps(dst, scratch);
9161cb0ef41Sopenharmony_ci    movaps(scratch, src1);
9171cb0ef41Sopenharmony_ci    pcmpgtd(scratch, src0);
9181cb0ef41Sopenharmony_ci    orps(dst, scratch);
9191cb0ef41Sopenharmony_ci    movshdup(dst, dst);
9201cb0ef41Sopenharmony_ci    pcmpeqd(scratch, scratch);
9211cb0ef41Sopenharmony_ci    xorps(dst, scratch);
9221cb0ef41Sopenharmony_ci  }
9231cb0ef41Sopenharmony_ci}
9241cb0ef41Sopenharmony_ci
9251cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
9261cb0ef41Sopenharmony_ci                                     uint8_t shift, XMMRegister xmm_tmp) {
9271cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
9281cb0ef41Sopenharmony_ci  DCHECK_GT(64, shift);
9291cb0ef41Sopenharmony_ci  DCHECK_NE(xmm_tmp, dst);
9301cb0ef41Sopenharmony_ci  DCHECK_NE(xmm_tmp, src);
9311cb0ef41Sopenharmony_ci  // Use logical right shift to emulate arithmetic right shifts:
9321cb0ef41Sopenharmony_ci  // Given:
9331cb0ef41Sopenharmony_ci  // signed >> c
9341cb0ef41Sopenharmony_ci  //   == (signed + 2^63 - 2^63) >> c
9351cb0ef41Sopenharmony_ci  //   == ((signed + 2^63) >> c) - (2^63 >> c)
9361cb0ef41Sopenharmony_ci  //                                ^^^^^^^^^
9371cb0ef41Sopenharmony_ci  //                                 xmm_tmp
9381cb0ef41Sopenharmony_ci  // signed + 2^63 is an unsigned number, so we can use logical right shifts.
9391cb0ef41Sopenharmony_ci
9401cb0ef41Sopenharmony_ci  // xmm_tmp = wasm_i64x2_const(0x80000000'00000000).
9411cb0ef41Sopenharmony_ci  Pcmpeqd(xmm_tmp, xmm_tmp);
9421cb0ef41Sopenharmony_ci  Psllq(xmm_tmp, byte{63});
9431cb0ef41Sopenharmony_ci
9441cb0ef41Sopenharmony_ci  if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
9451cb0ef41Sopenharmony_ci    movaps(dst, src);
9461cb0ef41Sopenharmony_ci    src = dst;
9471cb0ef41Sopenharmony_ci  }
9481cb0ef41Sopenharmony_ci  // Add a bias of 2^63 to convert signed to unsigned.
9491cb0ef41Sopenharmony_ci  // Since only highest bit changes, use pxor instead of paddq.
9501cb0ef41Sopenharmony_ci  Pxor(dst, src, xmm_tmp);
9511cb0ef41Sopenharmony_ci  // Logically shift both value and bias.
9521cb0ef41Sopenharmony_ci  Psrlq(dst, shift);
9531cb0ef41Sopenharmony_ci  Psrlq(xmm_tmp, shift);
9541cb0ef41Sopenharmony_ci  // Subtract shifted bias to convert back to signed value.
9551cb0ef41Sopenharmony_ci  Psubq(dst, xmm_tmp);
9561cb0ef41Sopenharmony_ci}
9571cb0ef41Sopenharmony_ci
9581cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
9591cb0ef41Sopenharmony_ci                                     Register shift, XMMRegister xmm_tmp,
9601cb0ef41Sopenharmony_ci                                     XMMRegister xmm_shift,
9611cb0ef41Sopenharmony_ci                                     Register tmp_shift) {
9621cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
9631cb0ef41Sopenharmony_ci  DCHECK_NE(xmm_tmp, dst);
9641cb0ef41Sopenharmony_ci  DCHECK_NE(xmm_tmp, src);
9651cb0ef41Sopenharmony_ci  DCHECK_NE(xmm_shift, dst);
9661cb0ef41Sopenharmony_ci  DCHECK_NE(xmm_shift, src);
9671cb0ef41Sopenharmony_ci  // tmp_shift can alias shift since we don't use shift after masking it.
9681cb0ef41Sopenharmony_ci
9691cb0ef41Sopenharmony_ci  // See I64x2ShrS with constant shift for explanation of this algorithm.
9701cb0ef41Sopenharmony_ci  Pcmpeqd(xmm_tmp, xmm_tmp);
9711cb0ef41Sopenharmony_ci  Psllq(xmm_tmp, byte{63});
9721cb0ef41Sopenharmony_ci
9731cb0ef41Sopenharmony_ci  // Shift modulo 64.
9741cb0ef41Sopenharmony_ci  Move(tmp_shift, shift);
9751cb0ef41Sopenharmony_ci  And(tmp_shift, Immediate(0x3F));
9761cb0ef41Sopenharmony_ci  Movd(xmm_shift, tmp_shift);
9771cb0ef41Sopenharmony_ci
9781cb0ef41Sopenharmony_ci  if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
9791cb0ef41Sopenharmony_ci    movaps(dst, src);
9801cb0ef41Sopenharmony_ci    src = dst;
9811cb0ef41Sopenharmony_ci  }
9821cb0ef41Sopenharmony_ci  Pxor(dst, src, xmm_tmp);
9831cb0ef41Sopenharmony_ci  Psrlq(dst, xmm_shift);
9841cb0ef41Sopenharmony_ci  Psrlq(xmm_tmp, xmm_shift);
9851cb0ef41Sopenharmony_ci  Psubq(dst, xmm_tmp);
9861cb0ef41Sopenharmony_ci}
9871cb0ef41Sopenharmony_ci
9881cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs,
9891cb0ef41Sopenharmony_ci                                    XMMRegister rhs, XMMRegister tmp1,
9901cb0ef41Sopenharmony_ci                                    XMMRegister tmp2) {
9911cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
9921cb0ef41Sopenharmony_ci  DCHECK(!AreAliased(dst, tmp1, tmp2));
9931cb0ef41Sopenharmony_ci  DCHECK(!AreAliased(lhs, tmp1, tmp2));
9941cb0ef41Sopenharmony_ci  DCHECK(!AreAliased(rhs, tmp1, tmp2));
9951cb0ef41Sopenharmony_ci
9961cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
9971cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
9981cb0ef41Sopenharmony_ci    // 1. Multiply high dword of each qword of left with right.
9991cb0ef41Sopenharmony_ci    vpsrlq(tmp1, lhs, byte{32});
10001cb0ef41Sopenharmony_ci    vpmuludq(tmp1, tmp1, rhs);
10011cb0ef41Sopenharmony_ci    // 2. Multiply high dword of each qword of right with left.
10021cb0ef41Sopenharmony_ci    vpsrlq(tmp2, rhs, byte{32});
10031cb0ef41Sopenharmony_ci    vpmuludq(tmp2, tmp2, lhs);
10041cb0ef41Sopenharmony_ci    // 3. Add 1 and 2, then shift left by 32 (this is the high dword of result).
10051cb0ef41Sopenharmony_ci    vpaddq(tmp2, tmp2, tmp1);
10061cb0ef41Sopenharmony_ci    vpsllq(tmp2, tmp2, byte{32});
10071cb0ef41Sopenharmony_ci    // 4. Multiply low dwords (this is the low dword of result).
10081cb0ef41Sopenharmony_ci    vpmuludq(dst, lhs, rhs);
10091cb0ef41Sopenharmony_ci    // 5. Add 3 and 4.
10101cb0ef41Sopenharmony_ci    vpaddq(dst, dst, tmp2);
10111cb0ef41Sopenharmony_ci  } else {
10121cb0ef41Sopenharmony_ci    // Same algorithm as AVX version, but with moves to not overwrite inputs.
10131cb0ef41Sopenharmony_ci    movaps(tmp1, lhs);
10141cb0ef41Sopenharmony_ci    movaps(tmp2, rhs);
10151cb0ef41Sopenharmony_ci    psrlq(tmp1, byte{32});
10161cb0ef41Sopenharmony_ci    pmuludq(tmp1, rhs);
10171cb0ef41Sopenharmony_ci    psrlq(tmp2, byte{32});
10181cb0ef41Sopenharmony_ci    pmuludq(tmp2, lhs);
10191cb0ef41Sopenharmony_ci    paddq(tmp2, tmp1);
10201cb0ef41Sopenharmony_ci    psllq(tmp2, byte{32});
10211cb0ef41Sopenharmony_ci    if (dst == rhs) {
10221cb0ef41Sopenharmony_ci      // pmuludq is commutative
10231cb0ef41Sopenharmony_ci      pmuludq(dst, lhs);
10241cb0ef41Sopenharmony_ci    } else {
10251cb0ef41Sopenharmony_ci      if (dst != lhs) {
10261cb0ef41Sopenharmony_ci        movaps(dst, lhs);
10271cb0ef41Sopenharmony_ci      }
10281cb0ef41Sopenharmony_ci      pmuludq(dst, rhs);
10291cb0ef41Sopenharmony_ci    }
10301cb0ef41Sopenharmony_ci    paddq(dst, tmp2);
10311cb0ef41Sopenharmony_ci  }
10321cb0ef41Sopenharmony_ci}
10331cb0ef41Sopenharmony_ci
10341cb0ef41Sopenharmony_ci// 1. Unpack src0, src1 into even-number elements of scratch.
10351cb0ef41Sopenharmony_ci// 2. Unpack src1, src0 into even-number elements of dst.
10361cb0ef41Sopenharmony_ci// 3. Multiply 1. with 2.
10371cb0ef41Sopenharmony_ci// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
10381cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
10391cb0ef41Sopenharmony_ci                                       XMMRegister src2, XMMRegister scratch,
10401cb0ef41Sopenharmony_ci                                       bool low, bool is_signed) {
10411cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
10421cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
10431cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
10441cb0ef41Sopenharmony_ci    if (low) {
10451cb0ef41Sopenharmony_ci      vpunpckldq(scratch, src1, src1);
10461cb0ef41Sopenharmony_ci      vpunpckldq(dst, src2, src2);
10471cb0ef41Sopenharmony_ci    } else {
10481cb0ef41Sopenharmony_ci      vpunpckhdq(scratch, src1, src1);
10491cb0ef41Sopenharmony_ci      vpunpckhdq(dst, src2, src2);
10501cb0ef41Sopenharmony_ci    }
10511cb0ef41Sopenharmony_ci    if (is_signed) {
10521cb0ef41Sopenharmony_ci      vpmuldq(dst, scratch, dst);
10531cb0ef41Sopenharmony_ci    } else {
10541cb0ef41Sopenharmony_ci      vpmuludq(dst, scratch, dst);
10551cb0ef41Sopenharmony_ci    }
10561cb0ef41Sopenharmony_ci  } else {
10571cb0ef41Sopenharmony_ci    uint8_t mask = low ? 0x50 : 0xFA;
10581cb0ef41Sopenharmony_ci    pshufd(scratch, src1, mask);
10591cb0ef41Sopenharmony_ci    pshufd(dst, src2, mask);
10601cb0ef41Sopenharmony_ci    if (is_signed) {
10611cb0ef41Sopenharmony_ci      CpuFeatureScope sse4_scope(this, SSE4_1);
10621cb0ef41Sopenharmony_ci      pmuldq(dst, scratch);
10631cb0ef41Sopenharmony_ci    } else {
10641cb0ef41Sopenharmony_ci      pmuludq(dst, scratch);
10651cb0ef41Sopenharmony_ci    }
10661cb0ef41Sopenharmony_ci  }
10671cb0ef41Sopenharmony_ci}
10681cb0ef41Sopenharmony_ci
10691cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst,
10701cb0ef41Sopenharmony_ci                                                  XMMRegister src) {
10711cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
10721cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
10731cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
10741cb0ef41Sopenharmony_ci    vpunpckhqdq(dst, src, src);
10751cb0ef41Sopenharmony_ci    vpmovsxdq(dst, dst);
10761cb0ef41Sopenharmony_ci  } else {
10771cb0ef41Sopenharmony_ci    CpuFeatureScope sse_scope(this, SSE4_1);
10781cb0ef41Sopenharmony_ci    if (dst == src) {
10791cb0ef41Sopenharmony_ci      movhlps(dst, src);
10801cb0ef41Sopenharmony_ci    } else {
10811cb0ef41Sopenharmony_ci      pshufd(dst, src, 0xEE);
10821cb0ef41Sopenharmony_ci    }
10831cb0ef41Sopenharmony_ci    pmovsxdq(dst, dst);
10841cb0ef41Sopenharmony_ci  }
10851cb0ef41Sopenharmony_ci}
10861cb0ef41Sopenharmony_ci
10871cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2UConvertI32x4High(XMMRegister dst,
10881cb0ef41Sopenharmony_ci                                                  XMMRegister src,
10891cb0ef41Sopenharmony_ci                                                  XMMRegister scratch) {
10901cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
10911cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
10921cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
10931cb0ef41Sopenharmony_ci    vpxor(scratch, scratch, scratch);
10941cb0ef41Sopenharmony_ci    vpunpckhdq(dst, src, scratch);
10951cb0ef41Sopenharmony_ci  } else {
10961cb0ef41Sopenharmony_ci    if (dst == src) {
10971cb0ef41Sopenharmony_ci      // xorps can be executed on more ports than pshufd.
10981cb0ef41Sopenharmony_ci      xorps(scratch, scratch);
10991cb0ef41Sopenharmony_ci      punpckhdq(dst, scratch);
11001cb0ef41Sopenharmony_ci    } else {
11011cb0ef41Sopenharmony_ci      CpuFeatureScope sse_scope(this, SSE4_1);
11021cb0ef41Sopenharmony_ci      // No dependency on dst.
11031cb0ef41Sopenharmony_ci      pshufd(dst, src, 0xEE);
11041cb0ef41Sopenharmony_ci      pmovzxdq(dst, dst);
11051cb0ef41Sopenharmony_ci    }
11061cb0ef41Sopenharmony_ci  }
11071cb0ef41Sopenharmony_ci}
11081cb0ef41Sopenharmony_ci
11091cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Not(XMMRegister dst, XMMRegister src,
11101cb0ef41Sopenharmony_ci                                   XMMRegister scratch) {
11111cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
11121cb0ef41Sopenharmony_ci  if (dst == src) {
11131cb0ef41Sopenharmony_ci    Pcmpeqd(scratch, scratch);
11141cb0ef41Sopenharmony_ci    Pxor(dst, scratch);
11151cb0ef41Sopenharmony_ci  } else {
11161cb0ef41Sopenharmony_ci    Pcmpeqd(dst, dst);
11171cb0ef41Sopenharmony_ci    Pxor(dst, src);
11181cb0ef41Sopenharmony_ci  }
11191cb0ef41Sopenharmony_ci}
11201cb0ef41Sopenharmony_ci
11211cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
11221cb0ef41Sopenharmony_ci                                      XMMRegister src1, XMMRegister src2,
11231cb0ef41Sopenharmony_ci                                      XMMRegister scratch) {
11241cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
11251cb0ef41Sopenharmony_ci  // v128.select = v128.or(v128.and(v1, c), v128.andnot(v2, c)).
11261cb0ef41Sopenharmony_ci  // pandn(x, y) = !x & y, so we have to flip the mask and input.
11271cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
11281cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
11291cb0ef41Sopenharmony_ci    vpandn(scratch, mask, src2);
11301cb0ef41Sopenharmony_ci    vpand(dst, src1, mask);
11311cb0ef41Sopenharmony_ci    vpor(dst, dst, scratch);
11321cb0ef41Sopenharmony_ci  } else {
11331cb0ef41Sopenharmony_ci    DCHECK_EQ(dst, mask);
11341cb0ef41Sopenharmony_ci    // Use float ops as they are 1 byte shorter than int ops.
11351cb0ef41Sopenharmony_ci    movaps(scratch, mask);
11361cb0ef41Sopenharmony_ci    andnps(scratch, src2);
11371cb0ef41Sopenharmony_ci    andps(dst, src1);
11381cb0ef41Sopenharmony_ci    orps(dst, scratch);
11391cb0ef41Sopenharmony_ci  }
11401cb0ef41Sopenharmony_ci}
11411cb0ef41Sopenharmony_ci
11421cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src,
11431cb0ef41Sopenharmony_ci                                          XMMRegister scratch) {
11441cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
11451cb0ef41Sopenharmony_ci  // The trap handler uses the current pc to creating a landing, so that it can
11461cb0ef41Sopenharmony_ci  // determine if a trap occured in Wasm code due to a OOB load. Make sure the
11471cb0ef41Sopenharmony_ci  // first instruction in each case below is the one that loads.
11481cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX2)) {
11491cb0ef41Sopenharmony_ci    CpuFeatureScope avx2_scope(this, AVX2);
11501cb0ef41Sopenharmony_ci    vpbroadcastb(dst, src);
11511cb0ef41Sopenharmony_ci  } else if (CpuFeatures::IsSupported(AVX)) {
11521cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
11531cb0ef41Sopenharmony_ci    // Avoid dependency on previous value of dst.
11541cb0ef41Sopenharmony_ci    vpinsrb(dst, scratch, src, uint8_t{0});
11551cb0ef41Sopenharmony_ci    vpxor(scratch, scratch, scratch);
11561cb0ef41Sopenharmony_ci    vpshufb(dst, dst, scratch);
11571cb0ef41Sopenharmony_ci  } else {
11581cb0ef41Sopenharmony_ci    CpuFeatureScope ssse4_scope(this, SSE4_1);
11591cb0ef41Sopenharmony_ci    pinsrb(dst, src, uint8_t{0});
11601cb0ef41Sopenharmony_ci    xorps(scratch, scratch);
11611cb0ef41Sopenharmony_ci    pshufb(dst, scratch);
11621cb0ef41Sopenharmony_ci  }
11631cb0ef41Sopenharmony_ci}
11641cb0ef41Sopenharmony_ci
11651cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src,
11661cb0ef41Sopenharmony_ci                                           XMMRegister scratch) {
11671cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
11681cb0ef41Sopenharmony_ci  // The trap handler uses the current pc to creating a landing, so that it can
11691cb0ef41Sopenharmony_ci  // determine if a trap occured in Wasm code due to a OOB load. Make sure the
11701cb0ef41Sopenharmony_ci  // first instruction in each case below is the one that loads.
11711cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX2)) {
11721cb0ef41Sopenharmony_ci    CpuFeatureScope avx2_scope(this, AVX2);
11731cb0ef41Sopenharmony_ci    vpbroadcastw(dst, src);
11741cb0ef41Sopenharmony_ci  } else if (CpuFeatures::IsSupported(AVX)) {
11751cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
11761cb0ef41Sopenharmony_ci    // Avoid dependency on previous value of dst.
11771cb0ef41Sopenharmony_ci    vpinsrw(dst, scratch, src, uint8_t{0});
11781cb0ef41Sopenharmony_ci    vpshuflw(dst, dst, uint8_t{0});
11791cb0ef41Sopenharmony_ci    vpunpcklqdq(dst, dst, dst);
11801cb0ef41Sopenharmony_ci  } else {
11811cb0ef41Sopenharmony_ci    pinsrw(dst, src, uint8_t{0});
11821cb0ef41Sopenharmony_ci    pshuflw(dst, dst, uint8_t{0});
11831cb0ef41Sopenharmony_ci    movlhps(dst, dst);
11841cb0ef41Sopenharmony_ci  }
11851cb0ef41Sopenharmony_ci}
11861cb0ef41Sopenharmony_ci
11871cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) {
11881cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
11891cb0ef41Sopenharmony_ci  // The trap handler uses the current pc to creating a landing, so that it can
11901cb0ef41Sopenharmony_ci  // determine if a trap occured in Wasm code due to a OOB load. Make sure the
11911cb0ef41Sopenharmony_ci  // first instruction in each case below is the one that loads.
11921cb0ef41Sopenharmony_ci  if (CpuFeatures::IsSupported(AVX)) {
11931cb0ef41Sopenharmony_ci    CpuFeatureScope avx_scope(this, AVX);
11941cb0ef41Sopenharmony_ci    vbroadcastss(dst, src);
11951cb0ef41Sopenharmony_ci  } else {
11961cb0ef41Sopenharmony_ci    movss(dst, src);
11971cb0ef41Sopenharmony_ci    shufps(dst, dst, byte{0});
11981cb0ef41Sopenharmony_ci  }
11991cb0ef41Sopenharmony_ci}
12001cb0ef41Sopenharmony_ci
12011cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
12021cb0ef41Sopenharmony_ci                                           uint8_t laneidx) {
12031cb0ef41Sopenharmony_ci  ASM_CODE_COMMENT(this);
12041cb0ef41Sopenharmony_ci  if (laneidx == 0) {
12051cb0ef41Sopenharmony_ci    Movlps(dst, src);
12061cb0ef41Sopenharmony_ci  } else {
12071cb0ef41Sopenharmony_ci    DCHECK_EQ(1, laneidx);
12081cb0ef41Sopenharmony_ci    Movhps(dst, src);
12091cb0ef41Sopenharmony_ci  }
12101cb0ef41Sopenharmony_ci}
12111cb0ef41Sopenharmony_ci
// Body of the qfma macro-assembler functions: emits
//   dst = src1 + src2 * src3
// on packed lanes. |suffix| is the instruction suffix, `ps` for packed single
// or `pd` for packed double. The names dst, src1, src2, src3 and tmp must be
// in scope at the expansion site.
//
// Every way |dst| can alias one of the sources is handled separately, so that
// the fewest possible instructions are emitted:
//  - FMA3: the 231/132/213 form is picked so that |dst| is the operand that is
//    both read and overwritten; with no aliasing |src1| is first copied into
//    |dst|.
//  - AVX without FMA3: the product goes to |tmp|, then a three-operand add.
//  - SSE: two-operand forms; |tmp| is only needed when |dst| aliases |src1|.
#define QFMA(suffix)                          \
  if (CpuFeatures::IsSupported(FMA3)) {       \
    CpuFeatureScope fma3_feature(this, FMA3); \
    if (dst == src1) {                        \
      vfmadd231##suffix(dst, src2, src3);     \
    } else if (dst == src2) {                 \
      vfmadd132##suffix(dst, src1, src3);     \
    } else if (dst == src3) {                 \
      vfmadd213##suffix(dst, src2, src1);     \
    } else {                                  \
      CpuFeatureScope avx_feature(this, AVX); \
      vmovups(dst, src1);                     \
      vfmadd231##suffix(dst, src2, src3);     \
    }                                         \
  } else if (CpuFeatures::IsSupported(AVX)) { \
    CpuFeatureScope avx_feature(this, AVX);   \
    vmul##suffix(tmp, src2, src3);            \
    vadd##suffix(dst, src1, tmp);             \
  } else if (dst == src1) {                   \
    movaps(tmp, src2);                        \
    mul##suffix(tmp, src3);                   \
    add##suffix(dst, tmp);                    \
  } else if (dst == src2) {                   \
    DCHECK_NE(src2, src1);                    \
    mul##suffix(dst, src3);                   \
    add##suffix(dst, src1);                   \
  } else if (dst == src3) {                   \
    DCHECK_NE(src3, src1);                    \
    mul##suffix(dst, src2);                   \
    add##suffix(dst, src1);                   \
  } else {                                    \
    movaps(dst, src2);                        \
    mul##suffix(dst, src3);                   \
    add##suffix(dst, src1);                   \
  }
12511cb0ef41Sopenharmony_ci
// Body of the qfms macro-assembler functions: emits
//   dst = src1 - src2 * src3
// on packed lanes. |suffix| is the instruction suffix, `ps` for packed single
// or `pd` for packed double. The names dst, src1, src2, src3 and tmp must be
// in scope at the expansion site.
//
// Register aliasing is handled as follows:
//  - FMA3: the 231/132/213 form is picked so that |dst| is the operand that is
//    both read and overwritten; with no aliasing |src1| is first copied into
//    |dst|.
//  - AVX without FMA3: the product goes to |tmp|, then a three-operand
//    subtract.
//  - SSE: the product is always formed in |tmp| before |dst| is written, so
//    |dst| may alias any of the sources.
#define QFMS(suffix)                          \
  if (CpuFeatures::IsSupported(FMA3)) {       \
    CpuFeatureScope fma3_feature(this, FMA3); \
    if (dst == src1) {                        \
      vfnmadd231##suffix(dst, src2, src3);    \
    } else if (dst == src2) {                 \
      vfnmadd132##suffix(dst, src1, src3);    \
    } else if (dst == src3) {                 \
      vfnmadd213##suffix(dst, src2, src1);    \
    } else {                                  \
      CpuFeatureScope avx_feature(this, AVX); \
      vmovups(dst, src1);                     \
      vfnmadd231##suffix(dst, src2, src3);    \
    }                                         \
  } else if (CpuFeatures::IsSupported(AVX)) { \
    CpuFeatureScope avx_feature(this, AVX);   \
    vmul##suffix(tmp, src2, src3);            \
    vsub##suffix(dst, src1, tmp);             \
  } else {                                    \
    movaps(tmp, src2);                        \
    mul##suffix(tmp, src3);                   \
    if (dst != src1) movaps(dst, src1);       \
    sub##suffix(dst, tmp);                    \
  }
12801cb0ef41Sopenharmony_ci
12811cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4Qfma(XMMRegister dst, XMMRegister src1,
12821cb0ef41Sopenharmony_ci                                     XMMRegister src2, XMMRegister src3,
12831cb0ef41Sopenharmony_ci                                     XMMRegister tmp) {
12841cb0ef41Sopenharmony_ci  QFMA(ps)
12851cb0ef41Sopenharmony_ci}
12861cb0ef41Sopenharmony_ci
12871cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4Qfms(XMMRegister dst, XMMRegister src1,
12881cb0ef41Sopenharmony_ci                                     XMMRegister src2, XMMRegister src3,
12891cb0ef41Sopenharmony_ci                                     XMMRegister tmp) {
12901cb0ef41Sopenharmony_ci  QFMS(ps)
12911cb0ef41Sopenharmony_ci}
12921cb0ef41Sopenharmony_ci
12931cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2Qfma(XMMRegister dst, XMMRegister src1,
12941cb0ef41Sopenharmony_ci                                     XMMRegister src2, XMMRegister src3,
12951cb0ef41Sopenharmony_ci                                     XMMRegister tmp) {
12961cb0ef41Sopenharmony_ci  QFMA(pd);
12971cb0ef41Sopenharmony_ci}
12981cb0ef41Sopenharmony_ci
12991cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2Qfms(XMMRegister dst, XMMRegister src1,
13001cb0ef41Sopenharmony_ci                                     XMMRegister src2, XMMRegister src3,
13011cb0ef41Sopenharmony_ci                                     XMMRegister tmp) {
13021cb0ef41Sopenharmony_ci  QFMS(pd);
13031cb0ef41Sopenharmony_ci}
13041cb0ef41Sopenharmony_ci
// The helper macros are only meant for the four Qfma/Qfms functions above;
// undefine them so they do not leak into whatever follows (e.g. in unity
// builds).
#undef QFMS
#undef QFMA
13061cb0ef41Sopenharmony_ci
13071cb0ef41Sopenharmony_ci}  // namespace internal
13081cb0ef41Sopenharmony_ci}  // namespace v8
13091cb0ef41Sopenharmony_ci
13101cb0ef41Sopenharmony_ci#undef DCHECK_OPERAND_IS_NOT_REG
1311