11cb0ef41Sopenharmony_ci// Copyright 2021 the V8 project authors. All rights reserved. 21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 31cb0ef41Sopenharmony_ci// found in the LICENSE file. 41cb0ef41Sopenharmony_ci 51cb0ef41Sopenharmony_ci#include "src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h" 61cb0ef41Sopenharmony_ci 71cb0ef41Sopenharmony_ci#include "src/codegen/assembler.h" 81cb0ef41Sopenharmony_ci#include "src/codegen/cpu-features.h" 91cb0ef41Sopenharmony_ci#include "src/codegen/register.h" 101cb0ef41Sopenharmony_ci 111cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32 121cb0ef41Sopenharmony_ci#include "src/codegen/ia32/register-ia32.h" 131cb0ef41Sopenharmony_ci#elif V8_TARGET_ARCH_X64 141cb0ef41Sopenharmony_ci#include "src/codegen/x64/register-x64.h" 151cb0ef41Sopenharmony_ci#else 161cb0ef41Sopenharmony_ci#error Unsupported target architecture. 171cb0ef41Sopenharmony_ci#endif 181cb0ef41Sopenharmony_ci 191cb0ef41Sopenharmony_ci// Operand on IA32 can be a wrapper for a single register, in which case they 201cb0ef41Sopenharmony_ci// should call I8x16Splat |src| being Register. 211cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32 221cb0ef41Sopenharmony_ci#define DCHECK_OPERAND_IS_NOT_REG(op) DCHECK(!op.is_reg_only()); 231cb0ef41Sopenharmony_ci#else 241cb0ef41Sopenharmony_ci#define DCHECK_OPERAND_IS_NOT_REG(op) 251cb0ef41Sopenharmony_ci#endif 261cb0ef41Sopenharmony_ci 271cb0ef41Sopenharmony_cinamespace v8 { 281cb0ef41Sopenharmony_cinamespace internal { 291cb0ef41Sopenharmony_ci 301cb0ef41Sopenharmony_civoid SharedTurboAssembler::Move(Register dst, uint32_t src) { 311cb0ef41Sopenharmony_ci // Helper to paper over the different assembler function names. 321cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32 331cb0ef41Sopenharmony_ci mov(dst, Immediate(src)); 341cb0ef41Sopenharmony_ci#elif V8_TARGET_ARCH_X64 351cb0ef41Sopenharmony_ci movl(dst, Immediate(src)); 361cb0ef41Sopenharmony_ci#else 371cb0ef41Sopenharmony_ci#error Unsupported target architecture. 381cb0ef41Sopenharmony_ci#endif 391cb0ef41Sopenharmony_ci} 401cb0ef41Sopenharmony_ci 411cb0ef41Sopenharmony_civoid SharedTurboAssembler::Move(Register dst, Register src) { 421cb0ef41Sopenharmony_ci // Helper to paper over the different assembler function names. 431cb0ef41Sopenharmony_ci if (dst != src) { 441cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32 451cb0ef41Sopenharmony_ci mov(dst, src); 461cb0ef41Sopenharmony_ci#elif V8_TARGET_ARCH_X64 471cb0ef41Sopenharmony_ci movq(dst, src); 481cb0ef41Sopenharmony_ci#else 491cb0ef41Sopenharmony_ci#error Unsupported target architecture. 501cb0ef41Sopenharmony_ci#endif 511cb0ef41Sopenharmony_ci } 521cb0ef41Sopenharmony_ci} 531cb0ef41Sopenharmony_ci 541cb0ef41Sopenharmony_civoid SharedTurboAssembler::Add(Register dst, Immediate src) { 551cb0ef41Sopenharmony_ci // Helper to paper over the different assembler function names. 561cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32 571cb0ef41Sopenharmony_ci add(dst, src); 581cb0ef41Sopenharmony_ci#elif V8_TARGET_ARCH_X64 591cb0ef41Sopenharmony_ci addq(dst, src); 601cb0ef41Sopenharmony_ci#else 611cb0ef41Sopenharmony_ci#error Unsupported target architecture. 621cb0ef41Sopenharmony_ci#endif 631cb0ef41Sopenharmony_ci} 641cb0ef41Sopenharmony_ci 651cb0ef41Sopenharmony_civoid SharedTurboAssembler::And(Register dst, Immediate src) { 661cb0ef41Sopenharmony_ci // Helper to paper over the different assembler function names. 671cb0ef41Sopenharmony_ci#if V8_TARGET_ARCH_IA32 681cb0ef41Sopenharmony_ci and_(dst, src); 691cb0ef41Sopenharmony_ci#elif V8_TARGET_ARCH_X64 701cb0ef41Sopenharmony_ci if (is_uint32(src.value())) { 711cb0ef41Sopenharmony_ci andl(dst, src); 721cb0ef41Sopenharmony_ci } else { 731cb0ef41Sopenharmony_ci andq(dst, src); 741cb0ef41Sopenharmony_ci } 751cb0ef41Sopenharmony_ci#else 761cb0ef41Sopenharmony_ci#error Unsupported target architecture. 771cb0ef41Sopenharmony_ci#endif 781cb0ef41Sopenharmony_ci} 791cb0ef41Sopenharmony_ci 801cb0ef41Sopenharmony_civoid SharedTurboAssembler::Movhps(XMMRegister dst, XMMRegister src1, 811cb0ef41Sopenharmony_ci Operand src2) { 821cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 831cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, AVX); 841cb0ef41Sopenharmony_ci vmovhps(dst, src1, src2); 851cb0ef41Sopenharmony_ci } else { 861cb0ef41Sopenharmony_ci if (dst != src1) { 871cb0ef41Sopenharmony_ci movaps(dst, src1); 881cb0ef41Sopenharmony_ci } 891cb0ef41Sopenharmony_ci movhps(dst, src2); 901cb0ef41Sopenharmony_ci } 911cb0ef41Sopenharmony_ci} 921cb0ef41Sopenharmony_ci 931cb0ef41Sopenharmony_civoid SharedTurboAssembler::Movlps(XMMRegister dst, XMMRegister src1, 941cb0ef41Sopenharmony_ci Operand src2) { 951cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 961cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, AVX); 971cb0ef41Sopenharmony_ci vmovlps(dst, src1, src2); 981cb0ef41Sopenharmony_ci } else { 991cb0ef41Sopenharmony_ci if (dst != src1) { 1001cb0ef41Sopenharmony_ci movaps(dst, src1); 1011cb0ef41Sopenharmony_ci } 1021cb0ef41Sopenharmony_ci movlps(dst, src2); 1031cb0ef41Sopenharmony_ci } 1041cb0ef41Sopenharmony_ci} 1051cb0ef41Sopenharmony_ci 1061cb0ef41Sopenharmony_civoid SharedTurboAssembler::Pblendvb(XMMRegister dst, XMMRegister src1, 1071cb0ef41Sopenharmony_ci XMMRegister src2, XMMRegister mask) { 1081cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 1091cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, AVX); 1101cb0ef41Sopenharmony_ci vpblendvb(dst, src1, src2, mask); 1111cb0ef41Sopenharmony_ci } else { 1121cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, SSE4_1); 1131cb0ef41Sopenharmony_ci DCHECK_EQ(mask, xmm0); 1141cb0ef41Sopenharmony_ci DCHECK_EQ(dst, src1); 1151cb0ef41Sopenharmony_ci pblendvb(dst, src2); 1161cb0ef41Sopenharmony_ci } 1171cb0ef41Sopenharmony_ci} 1181cb0ef41Sopenharmony_ci 1191cb0ef41Sopenharmony_civoid SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, 1201cb0ef41Sopenharmony_ci XMMRegister src2, uint8_t imm8) { 1211cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 1221cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 1231cb0ef41Sopenharmony_ci vshufps(dst, src1, src2, imm8); 1241cb0ef41Sopenharmony_ci } else { 1251cb0ef41Sopenharmony_ci if (dst != src1) { 1261cb0ef41Sopenharmony_ci movaps(dst, src1); 1271cb0ef41Sopenharmony_ci } 1281cb0ef41Sopenharmony_ci shufps(dst, src2, imm8); 1291cb0ef41Sopenharmony_ci } 1301cb0ef41Sopenharmony_ci} 1311cb0ef41Sopenharmony_ci 1321cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src, 1331cb0ef41Sopenharmony_ci uint8_t lane) { 1341cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 1351cb0ef41Sopenharmony_ci if (lane == 0) { 1361cb0ef41Sopenharmony_ci if (dst != src) { 1371cb0ef41Sopenharmony_ci Movaps(dst, src); 1381cb0ef41Sopenharmony_ci } 1391cb0ef41Sopenharmony_ci } else { 1401cb0ef41Sopenharmony_ci DCHECK_EQ(1, lane); 1411cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 1421cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 1431cb0ef41Sopenharmony_ci // Pass src as operand to avoid false-dependency on dst. 1441cb0ef41Sopenharmony_ci vmovhlps(dst, src, src); 1451cb0ef41Sopenharmony_ci } else { 1461cb0ef41Sopenharmony_ci movhlps(dst, src); 1471cb0ef41Sopenharmony_ci } 1481cb0ef41Sopenharmony_ci } 1491cb0ef41Sopenharmony_ci} 1501cb0ef41Sopenharmony_ci 1511cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src, 1521cb0ef41Sopenharmony_ci DoubleRegister rep, uint8_t lane) { 1531cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 1541cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 1551cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, AVX); 1561cb0ef41Sopenharmony_ci if (lane == 0) { 1571cb0ef41Sopenharmony_ci vmovsd(dst, src, rep); 1581cb0ef41Sopenharmony_ci } else { 1591cb0ef41Sopenharmony_ci vmovlhps(dst, src, rep); 1601cb0ef41Sopenharmony_ci } 1611cb0ef41Sopenharmony_ci } else { 1621cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, SSE4_1); 1631cb0ef41Sopenharmony_ci if (dst != src) { 1641cb0ef41Sopenharmony_ci DCHECK_NE(dst, rep); // Ensure rep is not overwritten. 1651cb0ef41Sopenharmony_ci movaps(dst, src); 1661cb0ef41Sopenharmony_ci } 1671cb0ef41Sopenharmony_ci if (lane == 0) { 1681cb0ef41Sopenharmony_ci movsd(dst, rep); 1691cb0ef41Sopenharmony_ci } else { 1701cb0ef41Sopenharmony_ci movlhps(dst, rep); 1711cb0ef41Sopenharmony_ci } 1721cb0ef41Sopenharmony_ci } 1731cb0ef41Sopenharmony_ci} 1741cb0ef41Sopenharmony_ci 1751cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4Min(XMMRegister dst, XMMRegister lhs, 1761cb0ef41Sopenharmony_ci XMMRegister rhs, XMMRegister scratch) { 1771cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 1781cb0ef41Sopenharmony_ci // The minps instruction doesn't propagate NaNs and +0's in its first 1791cb0ef41Sopenharmony_ci // operand. Perform minps in both orders, merge the results, and adjust. 1801cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 1811cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, AVX); 1821cb0ef41Sopenharmony_ci vminps(scratch, lhs, rhs); 1831cb0ef41Sopenharmony_ci vminps(dst, rhs, lhs); 1841cb0ef41Sopenharmony_ci } else if (dst == lhs || dst == rhs) { 1851cb0ef41Sopenharmony_ci XMMRegister src = dst == lhs ? rhs : lhs; 1861cb0ef41Sopenharmony_ci movaps(scratch, src); 1871cb0ef41Sopenharmony_ci minps(scratch, dst); 1881cb0ef41Sopenharmony_ci minps(dst, src); 1891cb0ef41Sopenharmony_ci } else { 1901cb0ef41Sopenharmony_ci movaps(scratch, lhs); 1911cb0ef41Sopenharmony_ci minps(scratch, rhs); 1921cb0ef41Sopenharmony_ci movaps(dst, rhs); 1931cb0ef41Sopenharmony_ci minps(dst, lhs); 1941cb0ef41Sopenharmony_ci } 1951cb0ef41Sopenharmony_ci // Propagate -0's and NaNs, which may be non-canonical. 1961cb0ef41Sopenharmony_ci Orps(scratch, dst); 1971cb0ef41Sopenharmony_ci // Canonicalize NaNs by quieting and clearing the payload. 1981cb0ef41Sopenharmony_ci Cmpunordps(dst, dst, scratch); 1991cb0ef41Sopenharmony_ci Orps(scratch, dst); 2001cb0ef41Sopenharmony_ci Psrld(dst, dst, byte{10}); 2011cb0ef41Sopenharmony_ci Andnps(dst, dst, scratch); 2021cb0ef41Sopenharmony_ci} 2031cb0ef41Sopenharmony_ci 2041cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4Max(XMMRegister dst, XMMRegister lhs, 2051cb0ef41Sopenharmony_ci XMMRegister rhs, XMMRegister scratch) { 2061cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 2071cb0ef41Sopenharmony_ci // The maxps instruction doesn't propagate NaNs and +0's in its first 2081cb0ef41Sopenharmony_ci // operand. Perform maxps in both orders, merge the results, and adjust. 2091cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 2101cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, AVX); 2111cb0ef41Sopenharmony_ci vmaxps(scratch, lhs, rhs); 2121cb0ef41Sopenharmony_ci vmaxps(dst, rhs, lhs); 2131cb0ef41Sopenharmony_ci } else if (dst == lhs || dst == rhs) { 2141cb0ef41Sopenharmony_ci XMMRegister src = dst == lhs ? rhs : lhs; 2151cb0ef41Sopenharmony_ci movaps(scratch, src); 2161cb0ef41Sopenharmony_ci maxps(scratch, dst); 2171cb0ef41Sopenharmony_ci maxps(dst, src); 2181cb0ef41Sopenharmony_ci } else { 2191cb0ef41Sopenharmony_ci movaps(scratch, lhs); 2201cb0ef41Sopenharmony_ci maxps(scratch, rhs); 2211cb0ef41Sopenharmony_ci movaps(dst, rhs); 2221cb0ef41Sopenharmony_ci maxps(dst, lhs); 2231cb0ef41Sopenharmony_ci } 2241cb0ef41Sopenharmony_ci // Find discrepancies. 2251cb0ef41Sopenharmony_ci Xorps(dst, scratch); 2261cb0ef41Sopenharmony_ci // Propagate NaNs, which may be non-canonical. 2271cb0ef41Sopenharmony_ci Orps(scratch, dst); 2281cb0ef41Sopenharmony_ci // Propagate sign discrepancy and (subtle) quiet NaNs. 2291cb0ef41Sopenharmony_ci Subps(scratch, scratch, dst); 2301cb0ef41Sopenharmony_ci // Canonicalize NaNs by clearing the payload. Sign is non-deterministic. 2311cb0ef41Sopenharmony_ci Cmpunordps(dst, dst, scratch); 2321cb0ef41Sopenharmony_ci Psrld(dst, dst, byte{10}); 2331cb0ef41Sopenharmony_ci Andnps(dst, dst, scratch); 2341cb0ef41Sopenharmony_ci} 2351cb0ef41Sopenharmony_ci 2361cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs, 2371cb0ef41Sopenharmony_ci XMMRegister rhs, XMMRegister scratch) { 2381cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 2391cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 2401cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, AVX); 2411cb0ef41Sopenharmony_ci // The minpd instruction doesn't propagate NaNs and +0's in its first 2421cb0ef41Sopenharmony_ci // operand. Perform minpd in both orders, merge the resuls, and adjust. 2431cb0ef41Sopenharmony_ci vminpd(scratch, lhs, rhs); 2441cb0ef41Sopenharmony_ci vminpd(dst, rhs, lhs); 2451cb0ef41Sopenharmony_ci // propagate -0's and NaNs, which may be non-canonical. 2461cb0ef41Sopenharmony_ci vorpd(scratch, scratch, dst); 2471cb0ef41Sopenharmony_ci // Canonicalize NaNs by quieting and clearing the payload. 2481cb0ef41Sopenharmony_ci vcmpunordpd(dst, dst, scratch); 2491cb0ef41Sopenharmony_ci vorpd(scratch, scratch, dst); 2501cb0ef41Sopenharmony_ci vpsrlq(dst, dst, byte{13}); 2511cb0ef41Sopenharmony_ci vandnpd(dst, dst, scratch); 2521cb0ef41Sopenharmony_ci } else { 2531cb0ef41Sopenharmony_ci // Compare lhs with rhs, and rhs with lhs, and have the results in scratch 2541cb0ef41Sopenharmony_ci // and dst. If dst overlaps with lhs or rhs, we can save a move. 2551cb0ef41Sopenharmony_ci if (dst == lhs || dst == rhs) { 2561cb0ef41Sopenharmony_ci XMMRegister src = dst == lhs ? rhs : lhs; 2571cb0ef41Sopenharmony_ci movaps(scratch, src); 2581cb0ef41Sopenharmony_ci minpd(scratch, dst); 2591cb0ef41Sopenharmony_ci minpd(dst, src); 2601cb0ef41Sopenharmony_ci } else { 2611cb0ef41Sopenharmony_ci movaps(scratch, lhs); 2621cb0ef41Sopenharmony_ci movaps(dst, rhs); 2631cb0ef41Sopenharmony_ci minpd(scratch, rhs); 2641cb0ef41Sopenharmony_ci minpd(dst, lhs); 2651cb0ef41Sopenharmony_ci } 2661cb0ef41Sopenharmony_ci orpd(scratch, dst); 2671cb0ef41Sopenharmony_ci cmpunordpd(dst, scratch); 2681cb0ef41Sopenharmony_ci orpd(scratch, dst); 2691cb0ef41Sopenharmony_ci psrlq(dst, byte{13}); 2701cb0ef41Sopenharmony_ci andnpd(dst, scratch); 2711cb0ef41Sopenharmony_ci } 2721cb0ef41Sopenharmony_ci} 2731cb0ef41Sopenharmony_ci 2741cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2Max(XMMRegister dst, XMMRegister lhs, 2751cb0ef41Sopenharmony_ci XMMRegister rhs, XMMRegister scratch) { 2761cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 2771cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 2781cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, AVX); 2791cb0ef41Sopenharmony_ci // The maxpd instruction doesn't propagate NaNs and +0's in its first 2801cb0ef41Sopenharmony_ci // operand. Perform maxpd in both orders, merge the resuls, and adjust. 2811cb0ef41Sopenharmony_ci vmaxpd(scratch, lhs, rhs); 2821cb0ef41Sopenharmony_ci vmaxpd(dst, rhs, lhs); 2831cb0ef41Sopenharmony_ci // Find discrepancies. 2841cb0ef41Sopenharmony_ci vxorpd(dst, dst, scratch); 2851cb0ef41Sopenharmony_ci // Propagate NaNs, which may be non-canonical. 2861cb0ef41Sopenharmony_ci vorpd(scratch, scratch, dst); 2871cb0ef41Sopenharmony_ci // Propagate sign discrepancy and (subtle) quiet NaNs. 2881cb0ef41Sopenharmony_ci vsubpd(scratch, scratch, dst); 2891cb0ef41Sopenharmony_ci // Canonicalize NaNs by clearing the payload. Sign is non-deterministic. 2901cb0ef41Sopenharmony_ci vcmpunordpd(dst, dst, scratch); 2911cb0ef41Sopenharmony_ci vpsrlq(dst, dst, byte{13}); 2921cb0ef41Sopenharmony_ci vandnpd(dst, dst, scratch); 2931cb0ef41Sopenharmony_ci } else { 2941cb0ef41Sopenharmony_ci if (dst == lhs || dst == rhs) { 2951cb0ef41Sopenharmony_ci XMMRegister src = dst == lhs ? rhs : lhs; 2961cb0ef41Sopenharmony_ci movaps(scratch, src); 2971cb0ef41Sopenharmony_ci maxpd(scratch, dst); 2981cb0ef41Sopenharmony_ci maxpd(dst, src); 2991cb0ef41Sopenharmony_ci } else { 3001cb0ef41Sopenharmony_ci movaps(scratch, lhs); 3011cb0ef41Sopenharmony_ci movaps(dst, rhs); 3021cb0ef41Sopenharmony_ci maxpd(scratch, rhs); 3031cb0ef41Sopenharmony_ci maxpd(dst, lhs); 3041cb0ef41Sopenharmony_ci } 3051cb0ef41Sopenharmony_ci xorpd(dst, scratch); 3061cb0ef41Sopenharmony_ci orpd(scratch, dst); 3071cb0ef41Sopenharmony_ci subpd(scratch, dst); 3081cb0ef41Sopenharmony_ci cmpunordpd(dst, scratch); 3091cb0ef41Sopenharmony_ci psrlq(dst, byte{13}); 3101cb0ef41Sopenharmony_ci andnpd(dst, scratch); 3111cb0ef41Sopenharmony_ci } 3121cb0ef41Sopenharmony_ci} 3131cb0ef41Sopenharmony_ci 3141cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) { 3151cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 3161cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX2)) { 3171cb0ef41Sopenharmony_ci CpuFeatureScope avx2_scope(this, AVX2); 3181cb0ef41Sopenharmony_ci vbroadcastss(dst, src); 3191cb0ef41Sopenharmony_ci } else if (CpuFeatures::IsSupported(AVX)) { 3201cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 3211cb0ef41Sopenharmony_ci vshufps(dst, src, src, 0); 3221cb0ef41Sopenharmony_ci } else { 3231cb0ef41Sopenharmony_ci if (dst == src) { 3241cb0ef41Sopenharmony_ci // 1 byte shorter than pshufd. 3251cb0ef41Sopenharmony_ci shufps(dst, src, 0); 3261cb0ef41Sopenharmony_ci } else { 3271cb0ef41Sopenharmony_ci pshufd(dst, src, 0); 3281cb0ef41Sopenharmony_ci } 3291cb0ef41Sopenharmony_ci } 3301cb0ef41Sopenharmony_ci} 3311cb0ef41Sopenharmony_ci 3321cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4ExtractLane(FloatRegister dst, XMMRegister src, 3331cb0ef41Sopenharmony_ci uint8_t lane) { 3341cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 3351cb0ef41Sopenharmony_ci DCHECK_LT(lane, 4); 3361cb0ef41Sopenharmony_ci // These instructions are shorter than insertps, but will leave junk in 3371cb0ef41Sopenharmony_ci // the top lanes of dst. 3381cb0ef41Sopenharmony_ci if (lane == 0) { 3391cb0ef41Sopenharmony_ci if (dst != src) { 3401cb0ef41Sopenharmony_ci Movaps(dst, src); 3411cb0ef41Sopenharmony_ci } 3421cb0ef41Sopenharmony_ci } else if (lane == 1) { 3431cb0ef41Sopenharmony_ci Movshdup(dst, src); 3441cb0ef41Sopenharmony_ci } else if (lane == 2 && dst == src) { 3451cb0ef41Sopenharmony_ci // Check dst == src to avoid false dependency on dst. 3461cb0ef41Sopenharmony_ci Movhlps(dst, src); 3471cb0ef41Sopenharmony_ci } else if (dst == src) { 3481cb0ef41Sopenharmony_ci Shufps(dst, src, src, lane); 3491cb0ef41Sopenharmony_ci } else { 3501cb0ef41Sopenharmony_ci Pshufd(dst, src, lane); 3511cb0ef41Sopenharmony_ci } 3521cb0ef41Sopenharmony_ci} 3531cb0ef41Sopenharmony_ci 3541cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src, 3551cb0ef41Sopenharmony_ci uint8_t laneidx) { 3561cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 3571cb0ef41Sopenharmony_ci if (laneidx == 0) { 3581cb0ef41Sopenharmony_ci Movss(dst, src); 3591cb0ef41Sopenharmony_ci } else { 3601cb0ef41Sopenharmony_ci DCHECK_GE(3, laneidx); 3611cb0ef41Sopenharmony_ci Extractps(dst, src, laneidx); 3621cb0ef41Sopenharmony_ci } 3631cb0ef41Sopenharmony_ci} 3641cb0ef41Sopenharmony_ci 3651cb0ef41Sopenharmony_citemplate <typename Op> 3661cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src, 3671cb0ef41Sopenharmony_ci XMMRegister scratch) { 3681cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 3691cb0ef41Sopenharmony_ci DCHECK(!CpuFeatures::IsSupported(AVX2)); 3701cb0ef41Sopenharmony_ci CpuFeatureScope ssse3_scope(this, SSSE3); 3711cb0ef41Sopenharmony_ci Movd(dst, src); 3721cb0ef41Sopenharmony_ci Xorps(scratch, scratch); 3731cb0ef41Sopenharmony_ci Pshufb(dst, scratch); 3741cb0ef41Sopenharmony_ci} 3751cb0ef41Sopenharmony_ci 3761cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src, 3771cb0ef41Sopenharmony_ci XMMRegister scratch) { 3781cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 3791cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX2)) { 3801cb0ef41Sopenharmony_ci CpuFeatureScope avx2_scope(this, AVX2); 3811cb0ef41Sopenharmony_ci Movd(scratch, src); 3821cb0ef41Sopenharmony_ci vpbroadcastb(dst, scratch); 3831cb0ef41Sopenharmony_ci } else { 3841cb0ef41Sopenharmony_ci I8x16SplatPreAvx2(dst, src, scratch); 3851cb0ef41Sopenharmony_ci } 3861cb0ef41Sopenharmony_ci} 3871cb0ef41Sopenharmony_ci 3881cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src, 3891cb0ef41Sopenharmony_ci XMMRegister scratch) { 3901cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 3911cb0ef41Sopenharmony_ci DCHECK_OPERAND_IS_NOT_REG(src); 3921cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX2)) { 3931cb0ef41Sopenharmony_ci CpuFeatureScope avx2_scope(this, AVX2); 3941cb0ef41Sopenharmony_ci vpbroadcastb(dst, src); 3951cb0ef41Sopenharmony_ci } else { 3961cb0ef41Sopenharmony_ci I8x16SplatPreAvx2(dst, src, scratch); 3971cb0ef41Sopenharmony_ci } 3981cb0ef41Sopenharmony_ci} 3991cb0ef41Sopenharmony_ci 4001cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1, 4011cb0ef41Sopenharmony_ci uint8_t src2, Register tmp1, 4021cb0ef41Sopenharmony_ci XMMRegister tmp2) { 4031cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 4041cb0ef41Sopenharmony_ci DCHECK_NE(dst, tmp2); 4051cb0ef41Sopenharmony_ci // Perform 16-bit shift, then mask away low bits. 4061cb0ef41Sopenharmony_ci if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) { 4071cb0ef41Sopenharmony_ci movaps(dst, src1); 4081cb0ef41Sopenharmony_ci src1 = dst; 4091cb0ef41Sopenharmony_ci } 4101cb0ef41Sopenharmony_ci 4111cb0ef41Sopenharmony_ci uint8_t shift = truncate_to_int3(src2); 4121cb0ef41Sopenharmony_ci Psllw(dst, src1, byte{shift}); 4131cb0ef41Sopenharmony_ci 4141cb0ef41Sopenharmony_ci uint8_t bmask = static_cast<uint8_t>(0xff << shift); 4151cb0ef41Sopenharmony_ci uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask; 4161cb0ef41Sopenharmony_ci Move(tmp1, mask); 4171cb0ef41Sopenharmony_ci Movd(tmp2, tmp1); 4181cb0ef41Sopenharmony_ci Pshufd(tmp2, tmp2, uint8_t{0}); 4191cb0ef41Sopenharmony_ci Pand(dst, tmp2); 4201cb0ef41Sopenharmony_ci} 4211cb0ef41Sopenharmony_ci 4221cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1, 4231cb0ef41Sopenharmony_ci Register src2, Register tmp1, 4241cb0ef41Sopenharmony_ci XMMRegister tmp2, XMMRegister tmp3) { 4251cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 4261cb0ef41Sopenharmony_ci DCHECK(!AreAliased(dst, tmp2, tmp3)); 4271cb0ef41Sopenharmony_ci DCHECK(!AreAliased(src1, tmp2, tmp3)); 4281cb0ef41Sopenharmony_ci 4291cb0ef41Sopenharmony_ci // Take shift value modulo 8. 4301cb0ef41Sopenharmony_ci Move(tmp1, src2); 4311cb0ef41Sopenharmony_ci And(tmp1, Immediate(7)); 4321cb0ef41Sopenharmony_ci Add(tmp1, Immediate(8)); 4331cb0ef41Sopenharmony_ci // Create a mask to unset high bits. 4341cb0ef41Sopenharmony_ci Movd(tmp3, tmp1); 4351cb0ef41Sopenharmony_ci Pcmpeqd(tmp2, tmp2); 4361cb0ef41Sopenharmony_ci Psrlw(tmp2, tmp2, tmp3); 4371cb0ef41Sopenharmony_ci Packuswb(tmp2, tmp2); 4381cb0ef41Sopenharmony_ci if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) { 4391cb0ef41Sopenharmony_ci movaps(dst, src1); 4401cb0ef41Sopenharmony_ci src1 = dst; 4411cb0ef41Sopenharmony_ci } 4421cb0ef41Sopenharmony_ci // Mask off the unwanted bits before word-shifting. 4431cb0ef41Sopenharmony_ci Pand(dst, src1, tmp2); 4441cb0ef41Sopenharmony_ci Add(tmp1, Immediate(-8)); 4451cb0ef41Sopenharmony_ci Movd(tmp3, tmp1); 4461cb0ef41Sopenharmony_ci Psllw(dst, dst, tmp3); 4471cb0ef41Sopenharmony_ci} 4481cb0ef41Sopenharmony_ci 4491cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1, 4501cb0ef41Sopenharmony_ci uint8_t src2, XMMRegister tmp) { 4511cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 4521cb0ef41Sopenharmony_ci // Unpack bytes into words, do word (16-bit) shifts, and repack. 4531cb0ef41Sopenharmony_ci DCHECK_NE(dst, tmp); 4541cb0ef41Sopenharmony_ci uint8_t shift = truncate_to_int3(src2) + 8; 4551cb0ef41Sopenharmony_ci 4561cb0ef41Sopenharmony_ci Punpckhbw(tmp, src1); 4571cb0ef41Sopenharmony_ci Punpcklbw(dst, src1); 4581cb0ef41Sopenharmony_ci Psraw(tmp, shift); 4591cb0ef41Sopenharmony_ci Psraw(dst, shift); 4601cb0ef41Sopenharmony_ci Packsswb(dst, tmp); 4611cb0ef41Sopenharmony_ci} 4621cb0ef41Sopenharmony_ci 4631cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1, 4641cb0ef41Sopenharmony_ci Register src2, Register tmp1, 4651cb0ef41Sopenharmony_ci XMMRegister tmp2, XMMRegister tmp3) { 4661cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 4671cb0ef41Sopenharmony_ci DCHECK(!AreAliased(dst, tmp2, tmp3)); 4681cb0ef41Sopenharmony_ci DCHECK_NE(src1, tmp2); 4691cb0ef41Sopenharmony_ci 4701cb0ef41Sopenharmony_ci // Unpack the bytes into words, do arithmetic shifts, and repack. 4711cb0ef41Sopenharmony_ci Punpckhbw(tmp2, src1); 4721cb0ef41Sopenharmony_ci Punpcklbw(dst, src1); 4731cb0ef41Sopenharmony_ci // Prepare shift value 4741cb0ef41Sopenharmony_ci Move(tmp1, src2); 4751cb0ef41Sopenharmony_ci // Take shift value modulo 8. 4761cb0ef41Sopenharmony_ci And(tmp1, Immediate(7)); 4771cb0ef41Sopenharmony_ci Add(tmp1, Immediate(8)); 4781cb0ef41Sopenharmony_ci Movd(tmp3, tmp1); 4791cb0ef41Sopenharmony_ci Psraw(tmp2, tmp3); 4801cb0ef41Sopenharmony_ci Psraw(dst, tmp3); 4811cb0ef41Sopenharmony_ci Packsswb(dst, tmp2); 4821cb0ef41Sopenharmony_ci} 4831cb0ef41Sopenharmony_ci 4841cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1, 4851cb0ef41Sopenharmony_ci uint8_t src2, Register tmp1, 4861cb0ef41Sopenharmony_ci XMMRegister tmp2) { 4871cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 4881cb0ef41Sopenharmony_ci DCHECK_NE(dst, tmp2); 4891cb0ef41Sopenharmony_ci if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) { 4901cb0ef41Sopenharmony_ci movaps(dst, src1); 4911cb0ef41Sopenharmony_ci src1 = dst; 4921cb0ef41Sopenharmony_ci } 4931cb0ef41Sopenharmony_ci 4941cb0ef41Sopenharmony_ci // Perform 16-bit shift, then mask away high bits. 4951cb0ef41Sopenharmony_ci uint8_t shift = truncate_to_int3(src2); 4961cb0ef41Sopenharmony_ci Psrlw(dst, src1, shift); 4971cb0ef41Sopenharmony_ci 4981cb0ef41Sopenharmony_ci uint8_t bmask = 0xff >> shift; 4991cb0ef41Sopenharmony_ci uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask; 5001cb0ef41Sopenharmony_ci Move(tmp1, mask); 5011cb0ef41Sopenharmony_ci Movd(tmp2, tmp1); 5021cb0ef41Sopenharmony_ci Pshufd(tmp2, tmp2, byte{0}); 5031cb0ef41Sopenharmony_ci Pand(dst, tmp2); 5041cb0ef41Sopenharmony_ci} 5051cb0ef41Sopenharmony_ci 5061cb0ef41Sopenharmony_civoid SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1, 5071cb0ef41Sopenharmony_ci Register src2, Register tmp1, 5081cb0ef41Sopenharmony_ci XMMRegister tmp2, XMMRegister tmp3) { 5091cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 5101cb0ef41Sopenharmony_ci DCHECK(!AreAliased(dst, tmp2, tmp3)); 5111cb0ef41Sopenharmony_ci DCHECK_NE(src1, tmp2); 5121cb0ef41Sopenharmony_ci 5131cb0ef41Sopenharmony_ci // Unpack the bytes into words, do logical shifts, and repack. 5141cb0ef41Sopenharmony_ci Punpckhbw(tmp2, src1); 5151cb0ef41Sopenharmony_ci Punpcklbw(dst, src1); 5161cb0ef41Sopenharmony_ci // Prepare shift value. 5171cb0ef41Sopenharmony_ci Move(tmp1, src2); 5181cb0ef41Sopenharmony_ci // Take shift value modulo 8. 5191cb0ef41Sopenharmony_ci And(tmp1, Immediate(7)); 5201cb0ef41Sopenharmony_ci Add(tmp1, Immediate(8)); 5211cb0ef41Sopenharmony_ci Movd(tmp3, tmp1); 5221cb0ef41Sopenharmony_ci Psrlw(tmp2, tmp3); 5231cb0ef41Sopenharmony_ci Psrlw(dst, tmp3); 5241cb0ef41Sopenharmony_ci Packuswb(dst, tmp2); 5251cb0ef41Sopenharmony_ci} 5261cb0ef41Sopenharmony_ci 5271cb0ef41Sopenharmony_citemplate <typename Op> 5281cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8SplatPreAvx2(XMMRegister dst, Op src) { 5291cb0ef41Sopenharmony_ci DCHECK(!CpuFeatures::IsSupported(AVX2)); 5301cb0ef41Sopenharmony_ci Movd(dst, src); 5311cb0ef41Sopenharmony_ci Pshuflw(dst, dst, uint8_t{0x0}); 5321cb0ef41Sopenharmony_ci Punpcklqdq(dst, dst); 5331cb0ef41Sopenharmony_ci} 5341cb0ef41Sopenharmony_ci 5351cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8Splat(XMMRegister dst, Register src) { 5361cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 5371cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX2)) { 5381cb0ef41Sopenharmony_ci CpuFeatureScope avx2_scope(this, AVX2); 5391cb0ef41Sopenharmony_ci Movd(dst, src); 5401cb0ef41Sopenharmony_ci vpbroadcastw(dst, dst); 5411cb0ef41Sopenharmony_ci } else { 5421cb0ef41Sopenharmony_ci I16x8SplatPreAvx2(dst, src); 5431cb0ef41Sopenharmony_ci } 5441cb0ef41Sopenharmony_ci} 5451cb0ef41Sopenharmony_ci 5461cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8Splat(XMMRegister dst, Operand src) { 5471cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 5481cb0ef41Sopenharmony_ci DCHECK_OPERAND_IS_NOT_REG(src); 5491cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX2)) { 5501cb0ef41Sopenharmony_ci CpuFeatureScope avx2_scope(this, AVX2); 5511cb0ef41Sopenharmony_ci vpbroadcastw(dst, src); 5521cb0ef41Sopenharmony_ci } else { 5531cb0ef41Sopenharmony_ci I16x8SplatPreAvx2(dst, src); 5541cb0ef41Sopenharmony_ci } 5551cb0ef41Sopenharmony_ci} 5561cb0ef41Sopenharmony_ci 5571cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, 5581cb0ef41Sopenharmony_ci XMMRegister src2, XMMRegister scratch, 5591cb0ef41Sopenharmony_ci bool is_signed) { 5601cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 5611cb0ef41Sopenharmony_ci is_signed ? Pmovsxbw(scratch, src1) : Pmovzxbw(scratch, src1); 5621cb0ef41Sopenharmony_ci is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2); 5631cb0ef41Sopenharmony_ci Pmullw(dst, scratch); 5641cb0ef41Sopenharmony_ci} 5651cb0ef41Sopenharmony_ci 5661cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, 5671cb0ef41Sopenharmony_ci XMMRegister src2, 5681cb0ef41Sopenharmony_ci XMMRegister scratch) { 5691cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 5701cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 5711cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 5721cb0ef41Sopenharmony_ci vpunpckhbw(scratch, src1, src1); 5731cb0ef41Sopenharmony_ci vpsraw(scratch, scratch, 8); 5741cb0ef41Sopenharmony_ci vpunpckhbw(dst, src2, src2); 5751cb0ef41Sopenharmony_ci vpsraw(dst, dst, 8); 5761cb0ef41Sopenharmony_ci vpmullw(dst, dst, scratch); 5771cb0ef41Sopenharmony_ci } else { 5781cb0ef41Sopenharmony_ci if (dst != src1) { 5791cb0ef41Sopenharmony_ci movaps(dst, src1); 5801cb0ef41Sopenharmony_ci } 5811cb0ef41Sopenharmony_ci movaps(scratch, src2); 5821cb0ef41Sopenharmony_ci punpckhbw(dst, dst); 5831cb0ef41Sopenharmony_ci psraw(dst, 8); 5841cb0ef41Sopenharmony_ci punpckhbw(scratch, scratch); 5851cb0ef41Sopenharmony_ci psraw(scratch, 8); 5861cb0ef41Sopenharmony_ci pmullw(dst, scratch); 5871cb0ef41Sopenharmony_ci } 5881cb0ef41Sopenharmony_ci} 5891cb0ef41Sopenharmony_ci 5901cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, 5911cb0ef41Sopenharmony_ci XMMRegister src2, 5921cb0ef41Sopenharmony_ci XMMRegister scratch) { 5931cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 5941cb0ef41Sopenharmony_ci // The logic here is slightly complicated to handle all the cases of register 5951cb0ef41Sopenharmony_ci // aliasing. This allows flexibility for callers in TurboFan and Liftoff. 5961cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 5971cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 5981cb0ef41Sopenharmony_ci if (src1 == src2) { 5991cb0ef41Sopenharmony_ci vpxor(scratch, scratch, scratch); 6001cb0ef41Sopenharmony_ci vpunpckhbw(dst, src1, scratch); 6011cb0ef41Sopenharmony_ci vpmullw(dst, dst, dst); 6021cb0ef41Sopenharmony_ci } else { 6031cb0ef41Sopenharmony_ci if (dst == src2) { 6041cb0ef41Sopenharmony_ci // We overwrite dst, then use src2, so swap src1 and src2. 6051cb0ef41Sopenharmony_ci std::swap(src1, src2); 6061cb0ef41Sopenharmony_ci } 6071cb0ef41Sopenharmony_ci vpxor(scratch, scratch, scratch); 6081cb0ef41Sopenharmony_ci vpunpckhbw(dst, src1, scratch); 6091cb0ef41Sopenharmony_ci vpunpckhbw(scratch, src2, scratch); 6101cb0ef41Sopenharmony_ci vpmullw(dst, dst, scratch); 6111cb0ef41Sopenharmony_ci } 6121cb0ef41Sopenharmony_ci } else { 6131cb0ef41Sopenharmony_ci if (src1 == src2) { 6141cb0ef41Sopenharmony_ci xorps(scratch, scratch); 6151cb0ef41Sopenharmony_ci if (dst != src1) { 6161cb0ef41Sopenharmony_ci movaps(dst, src1); 6171cb0ef41Sopenharmony_ci } 6181cb0ef41Sopenharmony_ci punpckhbw(dst, scratch); 6191cb0ef41Sopenharmony_ci pmullw(dst, scratch); 6201cb0ef41Sopenharmony_ci } else { 6211cb0ef41Sopenharmony_ci // When dst == src1, nothing special needs to be done. 6221cb0ef41Sopenharmony_ci // When dst == src2, swap src1 and src2, since we overwrite dst. 6231cb0ef41Sopenharmony_ci // When dst is unique, copy src1 to dst first. 6241cb0ef41Sopenharmony_ci if (dst == src2) { 6251cb0ef41Sopenharmony_ci std::swap(src1, src2); 6261cb0ef41Sopenharmony_ci // Now, dst == src1. 6271cb0ef41Sopenharmony_ci } else if (dst != src1) { 6281cb0ef41Sopenharmony_ci // dst != src1 && dst != src2. 6291cb0ef41Sopenharmony_ci movaps(dst, src1); 6301cb0ef41Sopenharmony_ci } 6311cb0ef41Sopenharmony_ci xorps(scratch, scratch); 6321cb0ef41Sopenharmony_ci punpckhbw(dst, scratch); 6331cb0ef41Sopenharmony_ci punpckhbw(scratch, src2); 6341cb0ef41Sopenharmony_ci psrlw(scratch, 8); 6351cb0ef41Sopenharmony_ci pmullw(dst, scratch); 6361cb0ef41Sopenharmony_ci } 6371cb0ef41Sopenharmony_ci } 6381cb0ef41Sopenharmony_ci} 6391cb0ef41Sopenharmony_ci 6401cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst, 6411cb0ef41Sopenharmony_ci XMMRegister src) { 6421cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 6431cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 6441cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 6451cb0ef41Sopenharmony_ci // src = |a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p| (high) 6461cb0ef41Sopenharmony_ci // dst = |i|i|j|j|k|k|l|l|m|m|n|n|o|o|p|p| 6471cb0ef41Sopenharmony_ci vpunpckhbw(dst, src, src); 6481cb0ef41Sopenharmony_ci vpsraw(dst, dst, 8); 6491cb0ef41Sopenharmony_ci } else { 6501cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE4_1); 6511cb0ef41Sopenharmony_ci if (dst == src) { 6521cb0ef41Sopenharmony_ci // 2 bytes shorter than pshufd, but has depdency on dst. 6531cb0ef41Sopenharmony_ci movhlps(dst, src); 6541cb0ef41Sopenharmony_ci pmovsxbw(dst, dst); 6551cb0ef41Sopenharmony_ci } else { 6561cb0ef41Sopenharmony_ci // No dependency on dst. 6571cb0ef41Sopenharmony_ci pshufd(dst, src, 0xEE); 6581cb0ef41Sopenharmony_ci pmovsxbw(dst, dst); 6591cb0ef41Sopenharmony_ci } 6601cb0ef41Sopenharmony_ci } 6611cb0ef41Sopenharmony_ci} 6621cb0ef41Sopenharmony_ci 6631cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst, 6641cb0ef41Sopenharmony_ci XMMRegister src, 6651cb0ef41Sopenharmony_ci XMMRegister scratch) { 6661cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 6671cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 6681cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 6691cb0ef41Sopenharmony_ci // tmp = |0|0|0|0|0|0|0|0 | 0|0|0|0|0|0|0|0| 6701cb0ef41Sopenharmony_ci // src = |a|b|c|d|e|f|g|h | i|j|k|l|m|n|o|p| 6711cb0ef41Sopenharmony_ci // dst = |0|a|0|b|0|c|0|d | 0|e|0|f|0|g|0|h| 6721cb0ef41Sopenharmony_ci XMMRegister tmp = dst == src ? scratch : dst; 6731cb0ef41Sopenharmony_ci vpxor(tmp, tmp, tmp); 6741cb0ef41Sopenharmony_ci vpunpckhbw(dst, src, tmp); 6751cb0ef41Sopenharmony_ci } else { 6761cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE4_1); 6771cb0ef41Sopenharmony_ci if (dst == src) { 6781cb0ef41Sopenharmony_ci // xorps can be executed on more ports than pshufd. 6791cb0ef41Sopenharmony_ci xorps(scratch, scratch); 6801cb0ef41Sopenharmony_ci punpckhbw(dst, scratch); 6811cb0ef41Sopenharmony_ci } else { 6821cb0ef41Sopenharmony_ci // No dependency on dst. 6831cb0ef41Sopenharmony_ci pshufd(dst, src, 0xEE); 6841cb0ef41Sopenharmony_ci pmovzxbw(dst, dst); 6851cb0ef41Sopenharmony_ci } 6861cb0ef41Sopenharmony_ci } 6871cb0ef41Sopenharmony_ci} 6881cb0ef41Sopenharmony_ci 6891cb0ef41Sopenharmony_civoid SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, 6901cb0ef41Sopenharmony_ci XMMRegister src2, 6911cb0ef41Sopenharmony_ci XMMRegister scratch) { 6921cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 6931cb0ef41Sopenharmony_ci // k = i16x8.splat(0x8000) 6941cb0ef41Sopenharmony_ci Pcmpeqd(scratch, scratch); 6951cb0ef41Sopenharmony_ci Psllw(scratch, scratch, byte{15}); 6961cb0ef41Sopenharmony_ci 6971cb0ef41Sopenharmony_ci if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) { 6981cb0ef41Sopenharmony_ci movaps(dst, src1); 6991cb0ef41Sopenharmony_ci src1 = dst; 7001cb0ef41Sopenharmony_ci } 7011cb0ef41Sopenharmony_ci 7021cb0ef41Sopenharmony_ci Pmulhrsw(dst, src1, src2); 7031cb0ef41Sopenharmony_ci Pcmpeqw(scratch, dst); 7041cb0ef41Sopenharmony_ci Pxor(dst, scratch); 7051cb0ef41Sopenharmony_ci} 7061cb0ef41Sopenharmony_ci 7071cb0ef41Sopenharmony_civoid SharedTurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst, 7081cb0ef41Sopenharmony_ci XMMRegister src, 7091cb0ef41Sopenharmony_ci XMMRegister tmp) { 7101cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 7111cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 7121cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 7131cb0ef41Sopenharmony_ci // src = |a|b|c|d|e|f|g|h| (low) 7141cb0ef41Sopenharmony_ci // scratch = |0|a|0|c|0|e|0|g| 7151cb0ef41Sopenharmony_ci vpsrld(tmp, src, 16); 7161cb0ef41Sopenharmony_ci // dst = |0|b|0|d|0|f|0|h| 7171cb0ef41Sopenharmony_ci vpblendw(dst, src, tmp, 0xAA); 7181cb0ef41Sopenharmony_ci // dst = |a+b|c+d|e+f|g+h| 7191cb0ef41Sopenharmony_ci vpaddd(dst, tmp, dst); 7201cb0ef41Sopenharmony_ci } else if (CpuFeatures::IsSupported(SSE4_1)) { 7211cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE4_1); 7221cb0ef41Sopenharmony_ci // There is a potentially better lowering if we get rip-relative 7231cb0ef41Sopenharmony_ci // constants, see https://github.com/WebAssembly/simd/pull/380. 7241cb0ef41Sopenharmony_ci movaps(tmp, src); 7251cb0ef41Sopenharmony_ci psrld(tmp, 16); 7261cb0ef41Sopenharmony_ci if (dst != src) { 7271cb0ef41Sopenharmony_ci movaps(dst, src); 7281cb0ef41Sopenharmony_ci } 7291cb0ef41Sopenharmony_ci pblendw(dst, tmp, 0xAA); 7301cb0ef41Sopenharmony_ci paddd(dst, tmp); 7311cb0ef41Sopenharmony_ci } else { 7321cb0ef41Sopenharmony_ci // src = |a|b|c|d|e|f|g|h| 7331cb0ef41Sopenharmony_ci // tmp = i32x4.splat(0x0000FFFF) 7341cb0ef41Sopenharmony_ci pcmpeqd(tmp, tmp); 7351cb0ef41Sopenharmony_ci psrld(tmp, byte{16}); 7361cb0ef41Sopenharmony_ci // tmp =|0|b|0|d|0|f|0|h| 7371cb0ef41Sopenharmony_ci andps(tmp, src); 7381cb0ef41Sopenharmony_ci // dst = |0|a|0|c|0|e|0|g| 7391cb0ef41Sopenharmony_ci if (dst != src) { 7401cb0ef41Sopenharmony_ci movaps(dst, src); 7411cb0ef41Sopenharmony_ci } 7421cb0ef41Sopenharmony_ci psrld(dst, byte{16}); 7431cb0ef41Sopenharmony_ci // dst = |a+b|c+d|e+f|g+h| 7441cb0ef41Sopenharmony_ci paddd(dst, tmp); 7451cb0ef41Sopenharmony_ci } 7461cb0ef41Sopenharmony_ci} 7471cb0ef41Sopenharmony_ci 7481cb0ef41Sopenharmony_ci// 1. Multiply low word into scratch. 7491cb0ef41Sopenharmony_ci// 2. Multiply high word (can be signed or unsigned) into dst. 7501cb0ef41Sopenharmony_ci// 3. Unpack and interleave scratch and dst into dst. 7511cb0ef41Sopenharmony_civoid SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1, 7521cb0ef41Sopenharmony_ci XMMRegister src2, XMMRegister scratch, 7531cb0ef41Sopenharmony_ci bool low, bool is_signed) { 7541cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 7551cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 7561cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 7571cb0ef41Sopenharmony_ci vpmullw(scratch, src1, src2); 7581cb0ef41Sopenharmony_ci is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2); 7591cb0ef41Sopenharmony_ci low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst); 7601cb0ef41Sopenharmony_ci } else { 7611cb0ef41Sopenharmony_ci DCHECK_EQ(dst, src1); 7621cb0ef41Sopenharmony_ci movaps(scratch, src1); 7631cb0ef41Sopenharmony_ci pmullw(dst, src2); 7641cb0ef41Sopenharmony_ci is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2); 7651cb0ef41Sopenharmony_ci low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch); 7661cb0ef41Sopenharmony_ci } 7671cb0ef41Sopenharmony_ci} 7681cb0ef41Sopenharmony_ci 7691cb0ef41Sopenharmony_civoid SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst, 7701cb0ef41Sopenharmony_ci XMMRegister src) { 7711cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 7721cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 7731cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 7741cb0ef41Sopenharmony_ci // src = |a|b|c|d|e|f|g|h| (high) 7751cb0ef41Sopenharmony_ci // dst = |e|e|f|f|g|g|h|h| 7761cb0ef41Sopenharmony_ci vpunpckhwd(dst, src, src); 7771cb0ef41Sopenharmony_ci vpsrad(dst, dst, 16); 7781cb0ef41Sopenharmony_ci } else { 7791cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE4_1); 7801cb0ef41Sopenharmony_ci if (dst == src) { 7811cb0ef41Sopenharmony_ci // 2 bytes shorter than pshufd, but has depdency on dst. 7821cb0ef41Sopenharmony_ci movhlps(dst, src); 7831cb0ef41Sopenharmony_ci pmovsxwd(dst, dst); 7841cb0ef41Sopenharmony_ci } else { 7851cb0ef41Sopenharmony_ci // No dependency on dst. 7861cb0ef41Sopenharmony_ci pshufd(dst, src, 0xEE); 7871cb0ef41Sopenharmony_ci pmovsxwd(dst, dst); 7881cb0ef41Sopenharmony_ci } 7891cb0ef41Sopenharmony_ci } 7901cb0ef41Sopenharmony_ci} 7911cb0ef41Sopenharmony_ci 7921cb0ef41Sopenharmony_civoid SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst, 7931cb0ef41Sopenharmony_ci XMMRegister src, 7941cb0ef41Sopenharmony_ci XMMRegister scratch) { 7951cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 7961cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 7971cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 7981cb0ef41Sopenharmony_ci // scratch = |0|0|0|0|0|0|0|0| 7991cb0ef41Sopenharmony_ci // src = |a|b|c|d|e|f|g|h| 8001cb0ef41Sopenharmony_ci // dst = |0|a|0|b|0|c|0|d| 8011cb0ef41Sopenharmony_ci XMMRegister tmp = dst == src ? scratch : dst; 8021cb0ef41Sopenharmony_ci vpxor(tmp, tmp, tmp); 8031cb0ef41Sopenharmony_ci vpunpckhwd(dst, src, tmp); 8041cb0ef41Sopenharmony_ci } else { 8051cb0ef41Sopenharmony_ci if (dst == src) { 8061cb0ef41Sopenharmony_ci // xorps can be executed on more ports than pshufd. 8071cb0ef41Sopenharmony_ci xorps(scratch, scratch); 8081cb0ef41Sopenharmony_ci punpckhwd(dst, scratch); 8091cb0ef41Sopenharmony_ci } else { 8101cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE4_1); 8111cb0ef41Sopenharmony_ci // No dependency on dst. 8121cb0ef41Sopenharmony_ci pshufd(dst, src, 0xEE); 8131cb0ef41Sopenharmony_ci pmovzxwd(dst, dst); 8141cb0ef41Sopenharmony_ci } 8151cb0ef41Sopenharmony_ci } 8161cb0ef41Sopenharmony_ci} 8171cb0ef41Sopenharmony_ci 8181cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2Neg(XMMRegister dst, XMMRegister src, 8191cb0ef41Sopenharmony_ci XMMRegister scratch) { 8201cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 8211cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 8221cb0ef41Sopenharmony_ci CpuFeatureScope scope(this, AVX); 8231cb0ef41Sopenharmony_ci vpxor(scratch, scratch, scratch); 8241cb0ef41Sopenharmony_ci vpsubq(dst, scratch, src); 8251cb0ef41Sopenharmony_ci } else { 8261cb0ef41Sopenharmony_ci if (dst == src) { 8271cb0ef41Sopenharmony_ci movaps(scratch, src); 8281cb0ef41Sopenharmony_ci std::swap(src, scratch); 8291cb0ef41Sopenharmony_ci } 8301cb0ef41Sopenharmony_ci pxor(dst, dst); 8311cb0ef41Sopenharmony_ci psubq(dst, src); 8321cb0ef41Sopenharmony_ci } 8331cb0ef41Sopenharmony_ci} 8341cb0ef41Sopenharmony_ci 8351cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2Abs(XMMRegister dst, XMMRegister src, 8361cb0ef41Sopenharmony_ci XMMRegister scratch) { 8371cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 8381cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 8391cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 8401cb0ef41Sopenharmony_ci XMMRegister tmp = dst == src ? scratch : dst; 8411cb0ef41Sopenharmony_ci vpxor(tmp, tmp, tmp); 8421cb0ef41Sopenharmony_ci vpsubq(tmp, tmp, src); 8431cb0ef41Sopenharmony_ci vblendvpd(dst, src, tmp, src); 8441cb0ef41Sopenharmony_ci } else { 8451cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE3); 8461cb0ef41Sopenharmony_ci movshdup(scratch, src); 8471cb0ef41Sopenharmony_ci if (dst != src) { 8481cb0ef41Sopenharmony_ci movaps(dst, src); 8491cb0ef41Sopenharmony_ci } 8501cb0ef41Sopenharmony_ci psrad(scratch, 31); 8511cb0ef41Sopenharmony_ci xorps(dst, scratch); 8521cb0ef41Sopenharmony_ci psubq(dst, scratch); 8531cb0ef41Sopenharmony_ci } 8541cb0ef41Sopenharmony_ci} 8551cb0ef41Sopenharmony_ci 8561cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2GtS(XMMRegister dst, XMMRegister src0, 8571cb0ef41Sopenharmony_ci XMMRegister src1, XMMRegister scratch) { 8581cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 8591cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 8601cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 8611cb0ef41Sopenharmony_ci vpcmpgtq(dst, src0, src1); 8621cb0ef41Sopenharmony_ci } else if (CpuFeatures::IsSupported(SSE4_2)) { 8631cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE4_2); 8641cb0ef41Sopenharmony_ci if (dst == src0) { 8651cb0ef41Sopenharmony_ci pcmpgtq(dst, src1); 8661cb0ef41Sopenharmony_ci } else if (dst == src1) { 8671cb0ef41Sopenharmony_ci movaps(scratch, src0); 8681cb0ef41Sopenharmony_ci pcmpgtq(scratch, src1); 8691cb0ef41Sopenharmony_ci movaps(dst, scratch); 8701cb0ef41Sopenharmony_ci } else { 8711cb0ef41Sopenharmony_ci movaps(dst, src0); 8721cb0ef41Sopenharmony_ci pcmpgtq(dst, src1); 8731cb0ef41Sopenharmony_ci } 8741cb0ef41Sopenharmony_ci } else { 8751cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE3); 8761cb0ef41Sopenharmony_ci DCHECK_NE(dst, src0); 8771cb0ef41Sopenharmony_ci DCHECK_NE(dst, src1); 8781cb0ef41Sopenharmony_ci movaps(dst, src1); 8791cb0ef41Sopenharmony_ci movaps(scratch, src0); 8801cb0ef41Sopenharmony_ci psubq(dst, src0); 8811cb0ef41Sopenharmony_ci pcmpeqd(scratch, src1); 8821cb0ef41Sopenharmony_ci andps(dst, scratch); 8831cb0ef41Sopenharmony_ci movaps(scratch, src0); 8841cb0ef41Sopenharmony_ci pcmpgtd(scratch, src1); 8851cb0ef41Sopenharmony_ci orps(dst, scratch); 8861cb0ef41Sopenharmony_ci movshdup(dst, dst); 8871cb0ef41Sopenharmony_ci } 8881cb0ef41Sopenharmony_ci} 8891cb0ef41Sopenharmony_ci 8901cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0, 8911cb0ef41Sopenharmony_ci XMMRegister src1, XMMRegister scratch) { 8921cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 8931cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 8941cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 8951cb0ef41Sopenharmony_ci vpcmpgtq(dst, src1, src0); 8961cb0ef41Sopenharmony_ci vpcmpeqd(scratch, scratch, scratch); 8971cb0ef41Sopenharmony_ci vpxor(dst, dst, scratch); 8981cb0ef41Sopenharmony_ci } else if (CpuFeatures::IsSupported(SSE4_2)) { 8991cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE4_2); 9001cb0ef41Sopenharmony_ci DCHECK_NE(dst, src0); 9011cb0ef41Sopenharmony_ci if (dst != src1) { 9021cb0ef41Sopenharmony_ci movaps(dst, src1); 9031cb0ef41Sopenharmony_ci } 9041cb0ef41Sopenharmony_ci pcmpgtq(dst, src0); 9051cb0ef41Sopenharmony_ci pcmpeqd(scratch, scratch); 9061cb0ef41Sopenharmony_ci xorps(dst, scratch); 9071cb0ef41Sopenharmony_ci } else { 9081cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE3); 9091cb0ef41Sopenharmony_ci DCHECK_NE(dst, src0); 9101cb0ef41Sopenharmony_ci DCHECK_NE(dst, src1); 9111cb0ef41Sopenharmony_ci movaps(dst, src0); 9121cb0ef41Sopenharmony_ci movaps(scratch, src1); 9131cb0ef41Sopenharmony_ci psubq(dst, src1); 9141cb0ef41Sopenharmony_ci pcmpeqd(scratch, src0); 9151cb0ef41Sopenharmony_ci andps(dst, scratch); 9161cb0ef41Sopenharmony_ci movaps(scratch, src1); 9171cb0ef41Sopenharmony_ci pcmpgtd(scratch, src0); 9181cb0ef41Sopenharmony_ci orps(dst, scratch); 9191cb0ef41Sopenharmony_ci movshdup(dst, dst); 9201cb0ef41Sopenharmony_ci pcmpeqd(scratch, scratch); 9211cb0ef41Sopenharmony_ci xorps(dst, scratch); 9221cb0ef41Sopenharmony_ci } 9231cb0ef41Sopenharmony_ci} 9241cb0ef41Sopenharmony_ci 9251cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src, 9261cb0ef41Sopenharmony_ci uint8_t shift, XMMRegister xmm_tmp) { 9271cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 9281cb0ef41Sopenharmony_ci DCHECK_GT(64, shift); 9291cb0ef41Sopenharmony_ci DCHECK_NE(xmm_tmp, dst); 9301cb0ef41Sopenharmony_ci DCHECK_NE(xmm_tmp, src); 9311cb0ef41Sopenharmony_ci // Use logical right shift to emulate arithmetic right shifts: 9321cb0ef41Sopenharmony_ci // Given: 9331cb0ef41Sopenharmony_ci // signed >> c 9341cb0ef41Sopenharmony_ci // == (signed + 2^63 - 2^63) >> c 9351cb0ef41Sopenharmony_ci // == ((signed + 2^63) >> c) - (2^63 >> c) 9361cb0ef41Sopenharmony_ci // ^^^^^^^^^ 9371cb0ef41Sopenharmony_ci // xmm_tmp 9381cb0ef41Sopenharmony_ci // signed + 2^63 is an unsigned number, so we can use logical right shifts. 9391cb0ef41Sopenharmony_ci 9401cb0ef41Sopenharmony_ci // xmm_tmp = wasm_i64x2_const(0x80000000'00000000). 9411cb0ef41Sopenharmony_ci Pcmpeqd(xmm_tmp, xmm_tmp); 9421cb0ef41Sopenharmony_ci Psllq(xmm_tmp, byte{63}); 9431cb0ef41Sopenharmony_ci 9441cb0ef41Sopenharmony_ci if (!CpuFeatures::IsSupported(AVX) && (dst != src)) { 9451cb0ef41Sopenharmony_ci movaps(dst, src); 9461cb0ef41Sopenharmony_ci src = dst; 9471cb0ef41Sopenharmony_ci } 9481cb0ef41Sopenharmony_ci // Add a bias of 2^63 to convert signed to unsigned. 9491cb0ef41Sopenharmony_ci // Since only highest bit changes, use pxor instead of paddq. 9501cb0ef41Sopenharmony_ci Pxor(dst, src, xmm_tmp); 9511cb0ef41Sopenharmony_ci // Logically shift both value and bias. 9521cb0ef41Sopenharmony_ci Psrlq(dst, shift); 9531cb0ef41Sopenharmony_ci Psrlq(xmm_tmp, shift); 9541cb0ef41Sopenharmony_ci // Subtract shifted bias to convert back to signed value. 9551cb0ef41Sopenharmony_ci Psubq(dst, xmm_tmp); 9561cb0ef41Sopenharmony_ci} 9571cb0ef41Sopenharmony_ci 9581cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src, 9591cb0ef41Sopenharmony_ci Register shift, XMMRegister xmm_tmp, 9601cb0ef41Sopenharmony_ci XMMRegister xmm_shift, 9611cb0ef41Sopenharmony_ci Register tmp_shift) { 9621cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 9631cb0ef41Sopenharmony_ci DCHECK_NE(xmm_tmp, dst); 9641cb0ef41Sopenharmony_ci DCHECK_NE(xmm_tmp, src); 9651cb0ef41Sopenharmony_ci DCHECK_NE(xmm_shift, dst); 9661cb0ef41Sopenharmony_ci DCHECK_NE(xmm_shift, src); 9671cb0ef41Sopenharmony_ci // tmp_shift can alias shift since we don't use shift after masking it. 9681cb0ef41Sopenharmony_ci 9691cb0ef41Sopenharmony_ci // See I64x2ShrS with constant shift for explanation of this algorithm. 9701cb0ef41Sopenharmony_ci Pcmpeqd(xmm_tmp, xmm_tmp); 9711cb0ef41Sopenharmony_ci Psllq(xmm_tmp, byte{63}); 9721cb0ef41Sopenharmony_ci 9731cb0ef41Sopenharmony_ci // Shift modulo 64. 9741cb0ef41Sopenharmony_ci Move(tmp_shift, shift); 9751cb0ef41Sopenharmony_ci And(tmp_shift, Immediate(0x3F)); 9761cb0ef41Sopenharmony_ci Movd(xmm_shift, tmp_shift); 9771cb0ef41Sopenharmony_ci 9781cb0ef41Sopenharmony_ci if (!CpuFeatures::IsSupported(AVX) && (dst != src)) { 9791cb0ef41Sopenharmony_ci movaps(dst, src); 9801cb0ef41Sopenharmony_ci src = dst; 9811cb0ef41Sopenharmony_ci } 9821cb0ef41Sopenharmony_ci Pxor(dst, src, xmm_tmp); 9831cb0ef41Sopenharmony_ci Psrlq(dst, xmm_shift); 9841cb0ef41Sopenharmony_ci Psrlq(xmm_tmp, xmm_shift); 9851cb0ef41Sopenharmony_ci Psubq(dst, xmm_tmp); 9861cb0ef41Sopenharmony_ci} 9871cb0ef41Sopenharmony_ci 9881cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs, 9891cb0ef41Sopenharmony_ci XMMRegister rhs, XMMRegister tmp1, 9901cb0ef41Sopenharmony_ci XMMRegister tmp2) { 9911cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 9921cb0ef41Sopenharmony_ci DCHECK(!AreAliased(dst, tmp1, tmp2)); 9931cb0ef41Sopenharmony_ci DCHECK(!AreAliased(lhs, tmp1, tmp2)); 9941cb0ef41Sopenharmony_ci DCHECK(!AreAliased(rhs, tmp1, tmp2)); 9951cb0ef41Sopenharmony_ci 9961cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 9971cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 9981cb0ef41Sopenharmony_ci // 1. Multiply high dword of each qword of left with right. 9991cb0ef41Sopenharmony_ci vpsrlq(tmp1, lhs, byte{32}); 10001cb0ef41Sopenharmony_ci vpmuludq(tmp1, tmp1, rhs); 10011cb0ef41Sopenharmony_ci // 2. Multiply high dword of each qword of right with left. 10021cb0ef41Sopenharmony_ci vpsrlq(tmp2, rhs, byte{32}); 10031cb0ef41Sopenharmony_ci vpmuludq(tmp2, tmp2, lhs); 10041cb0ef41Sopenharmony_ci // 3. Add 1 and 2, then shift left by 32 (this is the high dword of result). 10051cb0ef41Sopenharmony_ci vpaddq(tmp2, tmp2, tmp1); 10061cb0ef41Sopenharmony_ci vpsllq(tmp2, tmp2, byte{32}); 10071cb0ef41Sopenharmony_ci // 4. Multiply low dwords (this is the low dword of result). 10081cb0ef41Sopenharmony_ci vpmuludq(dst, lhs, rhs); 10091cb0ef41Sopenharmony_ci // 5. Add 3 and 4. 10101cb0ef41Sopenharmony_ci vpaddq(dst, dst, tmp2); 10111cb0ef41Sopenharmony_ci } else { 10121cb0ef41Sopenharmony_ci // Same algorithm as AVX version, but with moves to not overwrite inputs. 10131cb0ef41Sopenharmony_ci movaps(tmp1, lhs); 10141cb0ef41Sopenharmony_ci movaps(tmp2, rhs); 10151cb0ef41Sopenharmony_ci psrlq(tmp1, byte{32}); 10161cb0ef41Sopenharmony_ci pmuludq(tmp1, rhs); 10171cb0ef41Sopenharmony_ci psrlq(tmp2, byte{32}); 10181cb0ef41Sopenharmony_ci pmuludq(tmp2, lhs); 10191cb0ef41Sopenharmony_ci paddq(tmp2, tmp1); 10201cb0ef41Sopenharmony_ci psllq(tmp2, byte{32}); 10211cb0ef41Sopenharmony_ci if (dst == rhs) { 10221cb0ef41Sopenharmony_ci // pmuludq is commutative 10231cb0ef41Sopenharmony_ci pmuludq(dst, lhs); 10241cb0ef41Sopenharmony_ci } else { 10251cb0ef41Sopenharmony_ci if (dst != lhs) { 10261cb0ef41Sopenharmony_ci movaps(dst, lhs); 10271cb0ef41Sopenharmony_ci } 10281cb0ef41Sopenharmony_ci pmuludq(dst, rhs); 10291cb0ef41Sopenharmony_ci } 10301cb0ef41Sopenharmony_ci paddq(dst, tmp2); 10311cb0ef41Sopenharmony_ci } 10321cb0ef41Sopenharmony_ci} 10331cb0ef41Sopenharmony_ci 10341cb0ef41Sopenharmony_ci// 1. Unpack src0, src1 into even-number elements of scratch. 10351cb0ef41Sopenharmony_ci// 2. Unpack src1, src0 into even-number elements of dst. 10361cb0ef41Sopenharmony_ci// 3. Multiply 1. with 2. 10371cb0ef41Sopenharmony_ci// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq. 10381cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1, 10391cb0ef41Sopenharmony_ci XMMRegister src2, XMMRegister scratch, 10401cb0ef41Sopenharmony_ci bool low, bool is_signed) { 10411cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 10421cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 10431cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 10441cb0ef41Sopenharmony_ci if (low) { 10451cb0ef41Sopenharmony_ci vpunpckldq(scratch, src1, src1); 10461cb0ef41Sopenharmony_ci vpunpckldq(dst, src2, src2); 10471cb0ef41Sopenharmony_ci } else { 10481cb0ef41Sopenharmony_ci vpunpckhdq(scratch, src1, src1); 10491cb0ef41Sopenharmony_ci vpunpckhdq(dst, src2, src2); 10501cb0ef41Sopenharmony_ci } 10511cb0ef41Sopenharmony_ci if (is_signed) { 10521cb0ef41Sopenharmony_ci vpmuldq(dst, scratch, dst); 10531cb0ef41Sopenharmony_ci } else { 10541cb0ef41Sopenharmony_ci vpmuludq(dst, scratch, dst); 10551cb0ef41Sopenharmony_ci } 10561cb0ef41Sopenharmony_ci } else { 10571cb0ef41Sopenharmony_ci uint8_t mask = low ? 0x50 : 0xFA; 10581cb0ef41Sopenharmony_ci pshufd(scratch, src1, mask); 10591cb0ef41Sopenharmony_ci pshufd(dst, src2, mask); 10601cb0ef41Sopenharmony_ci if (is_signed) { 10611cb0ef41Sopenharmony_ci CpuFeatureScope sse4_scope(this, SSE4_1); 10621cb0ef41Sopenharmony_ci pmuldq(dst, scratch); 10631cb0ef41Sopenharmony_ci } else { 10641cb0ef41Sopenharmony_ci pmuludq(dst, scratch); 10651cb0ef41Sopenharmony_ci } 10661cb0ef41Sopenharmony_ci } 10671cb0ef41Sopenharmony_ci} 10681cb0ef41Sopenharmony_ci 10691cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst, 10701cb0ef41Sopenharmony_ci XMMRegister src) { 10711cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 10721cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 10731cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 10741cb0ef41Sopenharmony_ci vpunpckhqdq(dst, src, src); 10751cb0ef41Sopenharmony_ci vpmovsxdq(dst, dst); 10761cb0ef41Sopenharmony_ci } else { 10771cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE4_1); 10781cb0ef41Sopenharmony_ci if (dst == src) { 10791cb0ef41Sopenharmony_ci movhlps(dst, src); 10801cb0ef41Sopenharmony_ci } else { 10811cb0ef41Sopenharmony_ci pshufd(dst, src, 0xEE); 10821cb0ef41Sopenharmony_ci } 10831cb0ef41Sopenharmony_ci pmovsxdq(dst, dst); 10841cb0ef41Sopenharmony_ci } 10851cb0ef41Sopenharmony_ci} 10861cb0ef41Sopenharmony_ci 10871cb0ef41Sopenharmony_civoid SharedTurboAssembler::I64x2UConvertI32x4High(XMMRegister dst, 10881cb0ef41Sopenharmony_ci XMMRegister src, 10891cb0ef41Sopenharmony_ci XMMRegister scratch) { 10901cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 10911cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 10921cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 10931cb0ef41Sopenharmony_ci vpxor(scratch, scratch, scratch); 10941cb0ef41Sopenharmony_ci vpunpckhdq(dst, src, scratch); 10951cb0ef41Sopenharmony_ci } else { 10961cb0ef41Sopenharmony_ci if (dst == src) { 10971cb0ef41Sopenharmony_ci // xorps can be executed on more ports than pshufd. 10981cb0ef41Sopenharmony_ci xorps(scratch, scratch); 10991cb0ef41Sopenharmony_ci punpckhdq(dst, scratch); 11001cb0ef41Sopenharmony_ci } else { 11011cb0ef41Sopenharmony_ci CpuFeatureScope sse_scope(this, SSE4_1); 11021cb0ef41Sopenharmony_ci // No dependency on dst. 11031cb0ef41Sopenharmony_ci pshufd(dst, src, 0xEE); 11041cb0ef41Sopenharmony_ci pmovzxdq(dst, dst); 11051cb0ef41Sopenharmony_ci } 11061cb0ef41Sopenharmony_ci } 11071cb0ef41Sopenharmony_ci} 11081cb0ef41Sopenharmony_ci 11091cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Not(XMMRegister dst, XMMRegister src, 11101cb0ef41Sopenharmony_ci XMMRegister scratch) { 11111cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 11121cb0ef41Sopenharmony_ci if (dst == src) { 11131cb0ef41Sopenharmony_ci Pcmpeqd(scratch, scratch); 11141cb0ef41Sopenharmony_ci Pxor(dst, scratch); 11151cb0ef41Sopenharmony_ci } else { 11161cb0ef41Sopenharmony_ci Pcmpeqd(dst, dst); 11171cb0ef41Sopenharmony_ci Pxor(dst, src); 11181cb0ef41Sopenharmony_ci } 11191cb0ef41Sopenharmony_ci} 11201cb0ef41Sopenharmony_ci 11211cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask, 11221cb0ef41Sopenharmony_ci XMMRegister src1, XMMRegister src2, 11231cb0ef41Sopenharmony_ci XMMRegister scratch) { 11241cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 11251cb0ef41Sopenharmony_ci // v128.select = v128.or(v128.and(v1, c), v128.andnot(v2, c)). 11261cb0ef41Sopenharmony_ci // pandn(x, y) = !x & y, so we have to flip the mask and input. 11271cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 11281cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 11291cb0ef41Sopenharmony_ci vpandn(scratch, mask, src2); 11301cb0ef41Sopenharmony_ci vpand(dst, src1, mask); 11311cb0ef41Sopenharmony_ci vpor(dst, dst, scratch); 11321cb0ef41Sopenharmony_ci } else { 11331cb0ef41Sopenharmony_ci DCHECK_EQ(dst, mask); 11341cb0ef41Sopenharmony_ci // Use float ops as they are 1 byte shorter than int ops. 11351cb0ef41Sopenharmony_ci movaps(scratch, mask); 11361cb0ef41Sopenharmony_ci andnps(scratch, src2); 11371cb0ef41Sopenharmony_ci andps(dst, src1); 11381cb0ef41Sopenharmony_ci orps(dst, scratch); 11391cb0ef41Sopenharmony_ci } 11401cb0ef41Sopenharmony_ci} 11411cb0ef41Sopenharmony_ci 11421cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src, 11431cb0ef41Sopenharmony_ci XMMRegister scratch) { 11441cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 11451cb0ef41Sopenharmony_ci // The trap handler uses the current pc to creating a landing, so that it can 11461cb0ef41Sopenharmony_ci // determine if a trap occured in Wasm code due to a OOB load. Make sure the 11471cb0ef41Sopenharmony_ci // first instruction in each case below is the one that loads. 11481cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX2)) { 11491cb0ef41Sopenharmony_ci CpuFeatureScope avx2_scope(this, AVX2); 11501cb0ef41Sopenharmony_ci vpbroadcastb(dst, src); 11511cb0ef41Sopenharmony_ci } else if (CpuFeatures::IsSupported(AVX)) { 11521cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 11531cb0ef41Sopenharmony_ci // Avoid dependency on previous value of dst. 11541cb0ef41Sopenharmony_ci vpinsrb(dst, scratch, src, uint8_t{0}); 11551cb0ef41Sopenharmony_ci vpxor(scratch, scratch, scratch); 11561cb0ef41Sopenharmony_ci vpshufb(dst, dst, scratch); 11571cb0ef41Sopenharmony_ci } else { 11581cb0ef41Sopenharmony_ci CpuFeatureScope ssse4_scope(this, SSE4_1); 11591cb0ef41Sopenharmony_ci pinsrb(dst, src, uint8_t{0}); 11601cb0ef41Sopenharmony_ci xorps(scratch, scratch); 11611cb0ef41Sopenharmony_ci pshufb(dst, scratch); 11621cb0ef41Sopenharmony_ci } 11631cb0ef41Sopenharmony_ci} 11641cb0ef41Sopenharmony_ci 11651cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src, 11661cb0ef41Sopenharmony_ci XMMRegister scratch) { 11671cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 11681cb0ef41Sopenharmony_ci // The trap handler uses the current pc to creating a landing, so that it can 11691cb0ef41Sopenharmony_ci // determine if a trap occured in Wasm code due to a OOB load. Make sure the 11701cb0ef41Sopenharmony_ci // first instruction in each case below is the one that loads. 11711cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX2)) { 11721cb0ef41Sopenharmony_ci CpuFeatureScope avx2_scope(this, AVX2); 11731cb0ef41Sopenharmony_ci vpbroadcastw(dst, src); 11741cb0ef41Sopenharmony_ci } else if (CpuFeatures::IsSupported(AVX)) { 11751cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 11761cb0ef41Sopenharmony_ci // Avoid dependency on previous value of dst. 11771cb0ef41Sopenharmony_ci vpinsrw(dst, scratch, src, uint8_t{0}); 11781cb0ef41Sopenharmony_ci vpshuflw(dst, dst, uint8_t{0}); 11791cb0ef41Sopenharmony_ci vpunpcklqdq(dst, dst, dst); 11801cb0ef41Sopenharmony_ci } else { 11811cb0ef41Sopenharmony_ci pinsrw(dst, src, uint8_t{0}); 11821cb0ef41Sopenharmony_ci pshuflw(dst, dst, uint8_t{0}); 11831cb0ef41Sopenharmony_ci movlhps(dst, dst); 11841cb0ef41Sopenharmony_ci } 11851cb0ef41Sopenharmony_ci} 11861cb0ef41Sopenharmony_ci 11871cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) { 11881cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 11891cb0ef41Sopenharmony_ci // The trap handler uses the current pc to creating a landing, so that it can 11901cb0ef41Sopenharmony_ci // determine if a trap occured in Wasm code due to a OOB load. Make sure the 11911cb0ef41Sopenharmony_ci // first instruction in each case below is the one that loads. 11921cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(AVX)) { 11931cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); 11941cb0ef41Sopenharmony_ci vbroadcastss(dst, src); 11951cb0ef41Sopenharmony_ci } else { 11961cb0ef41Sopenharmony_ci movss(dst, src); 11971cb0ef41Sopenharmony_ci shufps(dst, dst, byte{0}); 11981cb0ef41Sopenharmony_ci } 11991cb0ef41Sopenharmony_ci} 12001cb0ef41Sopenharmony_ci 12011cb0ef41Sopenharmony_civoid SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src, 12021cb0ef41Sopenharmony_ci uint8_t laneidx) { 12031cb0ef41Sopenharmony_ci ASM_CODE_COMMENT(this); 12041cb0ef41Sopenharmony_ci if (laneidx == 0) { 12051cb0ef41Sopenharmony_ci Movlps(dst, src); 12061cb0ef41Sopenharmony_ci } else { 12071cb0ef41Sopenharmony_ci DCHECK_EQ(1, laneidx); 12081cb0ef41Sopenharmony_ci Movhps(dst, src); 12091cb0ef41Sopenharmony_ci } 12101cb0ef41Sopenharmony_ci} 12111cb0ef41Sopenharmony_ci 12121cb0ef41Sopenharmony_ci// Helper macro to define qfma macro-assembler. This takes care of every 12131cb0ef41Sopenharmony_ci// possible case of register aliasing to minimize the number of instructions. 12141cb0ef41Sopenharmony_ci#define QFMA(ps_or_pd) \ 12151cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(FMA3)) { \ 12161cb0ef41Sopenharmony_ci CpuFeatureScope fma3_scope(this, FMA3); \ 12171cb0ef41Sopenharmony_ci if (dst == src1) { \ 12181cb0ef41Sopenharmony_ci vfmadd231##ps_or_pd(dst, src2, src3); \ 12191cb0ef41Sopenharmony_ci } else if (dst == src2) { \ 12201cb0ef41Sopenharmony_ci vfmadd132##ps_or_pd(dst, src1, src3); \ 12211cb0ef41Sopenharmony_ci } else if (dst == src3) { \ 12221cb0ef41Sopenharmony_ci vfmadd213##ps_or_pd(dst, src2, src1); \ 12231cb0ef41Sopenharmony_ci } else { \ 12241cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); \ 12251cb0ef41Sopenharmony_ci vmovups(dst, src1); \ 12261cb0ef41Sopenharmony_ci vfmadd231##ps_or_pd(dst, src2, src3); \ 12271cb0ef41Sopenharmony_ci } \ 12281cb0ef41Sopenharmony_ci } else if (CpuFeatures::IsSupported(AVX)) { \ 12291cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); \ 12301cb0ef41Sopenharmony_ci vmul##ps_or_pd(tmp, src2, src3); \ 12311cb0ef41Sopenharmony_ci vadd##ps_or_pd(dst, src1, tmp); \ 12321cb0ef41Sopenharmony_ci } else { \ 12331cb0ef41Sopenharmony_ci if (dst == src1) { \ 12341cb0ef41Sopenharmony_ci movaps(tmp, src2); \ 12351cb0ef41Sopenharmony_ci mul##ps_or_pd(tmp, src3); \ 12361cb0ef41Sopenharmony_ci add##ps_or_pd(dst, tmp); \ 12371cb0ef41Sopenharmony_ci } else if (dst == src2) { \ 12381cb0ef41Sopenharmony_ci DCHECK_NE(src2, src1); \ 12391cb0ef41Sopenharmony_ci mul##ps_or_pd(src2, src3); \ 12401cb0ef41Sopenharmony_ci add##ps_or_pd(src2, src1); \ 12411cb0ef41Sopenharmony_ci } else if (dst == src3) { \ 12421cb0ef41Sopenharmony_ci DCHECK_NE(src3, src1); \ 12431cb0ef41Sopenharmony_ci mul##ps_or_pd(src3, src2); \ 12441cb0ef41Sopenharmony_ci add##ps_or_pd(src3, src1); \ 12451cb0ef41Sopenharmony_ci } else { \ 12461cb0ef41Sopenharmony_ci movaps(dst, src2); \ 12471cb0ef41Sopenharmony_ci mul##ps_or_pd(dst, src3); \ 12481cb0ef41Sopenharmony_ci add##ps_or_pd(dst, src1); \ 12491cb0ef41Sopenharmony_ci } \ 12501cb0ef41Sopenharmony_ci } 12511cb0ef41Sopenharmony_ci 12521cb0ef41Sopenharmony_ci// Helper macro to define qfms macro-assembler. This takes care of every 12531cb0ef41Sopenharmony_ci// possible case of register aliasing to minimize the number of instructions. 12541cb0ef41Sopenharmony_ci#define QFMS(ps_or_pd) \ 12551cb0ef41Sopenharmony_ci if (CpuFeatures::IsSupported(FMA3)) { \ 12561cb0ef41Sopenharmony_ci CpuFeatureScope fma3_scope(this, FMA3); \ 12571cb0ef41Sopenharmony_ci if (dst == src1) { \ 12581cb0ef41Sopenharmony_ci vfnmadd231##ps_or_pd(dst, src2, src3); \ 12591cb0ef41Sopenharmony_ci } else if (dst == src2) { \ 12601cb0ef41Sopenharmony_ci vfnmadd132##ps_or_pd(dst, src1, src3); \ 12611cb0ef41Sopenharmony_ci } else if (dst == src3) { \ 12621cb0ef41Sopenharmony_ci vfnmadd213##ps_or_pd(dst, src2, src1); \ 12631cb0ef41Sopenharmony_ci } else { \ 12641cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); \ 12651cb0ef41Sopenharmony_ci vmovups(dst, src1); \ 12661cb0ef41Sopenharmony_ci vfnmadd231##ps_or_pd(dst, src2, src3); \ 12671cb0ef41Sopenharmony_ci } \ 12681cb0ef41Sopenharmony_ci } else if (CpuFeatures::IsSupported(AVX)) { \ 12691cb0ef41Sopenharmony_ci CpuFeatureScope avx_scope(this, AVX); \ 12701cb0ef41Sopenharmony_ci vmul##ps_or_pd(tmp, src2, src3); \ 12711cb0ef41Sopenharmony_ci vsub##ps_or_pd(dst, src1, tmp); \ 12721cb0ef41Sopenharmony_ci } else { \ 12731cb0ef41Sopenharmony_ci movaps(tmp, src2); \ 12741cb0ef41Sopenharmony_ci mul##ps_or_pd(tmp, src3); \ 12751cb0ef41Sopenharmony_ci if (dst != src1) { \ 12761cb0ef41Sopenharmony_ci movaps(dst, src1); \ 12771cb0ef41Sopenharmony_ci } \ 12781cb0ef41Sopenharmony_ci sub##ps_or_pd(dst, tmp); \ 12791cb0ef41Sopenharmony_ci } 12801cb0ef41Sopenharmony_ci 12811cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4Qfma(XMMRegister dst, XMMRegister src1, 12821cb0ef41Sopenharmony_ci XMMRegister src2, XMMRegister src3, 12831cb0ef41Sopenharmony_ci XMMRegister tmp) { 12841cb0ef41Sopenharmony_ci QFMA(ps) 12851cb0ef41Sopenharmony_ci} 12861cb0ef41Sopenharmony_ci 12871cb0ef41Sopenharmony_civoid SharedTurboAssembler::F32x4Qfms(XMMRegister dst, XMMRegister src1, 12881cb0ef41Sopenharmony_ci XMMRegister src2, XMMRegister src3, 12891cb0ef41Sopenharmony_ci XMMRegister tmp) { 12901cb0ef41Sopenharmony_ci QFMS(ps) 12911cb0ef41Sopenharmony_ci} 12921cb0ef41Sopenharmony_ci 12931cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2Qfma(XMMRegister dst, XMMRegister src1, 12941cb0ef41Sopenharmony_ci XMMRegister src2, XMMRegister src3, 12951cb0ef41Sopenharmony_ci XMMRegister tmp) { 12961cb0ef41Sopenharmony_ci QFMA(pd); 12971cb0ef41Sopenharmony_ci} 12981cb0ef41Sopenharmony_ci 12991cb0ef41Sopenharmony_civoid SharedTurboAssembler::F64x2Qfms(XMMRegister dst, XMMRegister src1, 13001cb0ef41Sopenharmony_ci XMMRegister src2, XMMRegister src3, 13011cb0ef41Sopenharmony_ci XMMRegister tmp) { 13021cb0ef41Sopenharmony_ci QFMS(pd); 13031cb0ef41Sopenharmony_ci} 13041cb0ef41Sopenharmony_ci 13051cb0ef41Sopenharmony_ci#undef QFMOP 13061cb0ef41Sopenharmony_ci 13071cb0ef41Sopenharmony_ci} // namespace internal 13081cb0ef41Sopenharmony_ci} // namespace v8 13091cb0ef41Sopenharmony_ci 13101cb0ef41Sopenharmony_ci#undef DCHECK_OPERAND_IS_NOT_REG 1311