// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may be
//     used to endorse or promote products derived from this software without
//     specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

#include <sys/mman.h>
#include <unistd.h>

#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>

#include "test-runner.h"
#include "test-utils.h"

#include "aarch64/cpu-aarch64.h"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#include "aarch64/simulator-aarch64.h"
#include "aarch64/test-utils-aarch64.h"

#include "test-assembler-aarch64.h"

#define TEST_SVE(name) TEST_SVE_INNER("ASM", name)

namespace vixl {
namespace aarch64 {

// Conveniently initialise P registers with scalar bit patterns. The
// destination lane size is ignored. This is optimised for call-site clarity,
// not generated code quality.
//
// Usage:
//
//   Initialise(&masm, p0, 0x1234);  // Sets p0 = 0b'0001'0010'0011'0100

void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value3,
                uint64_t value2,
                uint64_t value1,
                uint64_t value0) {
  // Generate a literal pool, as in the array form.
  UseScratchRegisterScope temps(masm);
  Register temp = temps.AcquireX();
  Label data;
  Label done;

  masm->Adr(temp, &data);
  masm->Ldr(pd, SVEMemOperand(temp));
  masm->B(&done);
  {
    ExactAssemblyScope total(masm, kPRegMaxSizeInBytes);
    masm->bind(&data);
    masm->dc64(value0);
    masm->dc64(value1);
    masm->dc64(value2);
    masm->dc64(value3);
  }
  masm->Bind(&done);
}

void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value2,
                uint64_t value1,
                uint64_t value0) {
  Initialise(masm, pd, 0, value2, value1, value0);
}

void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value1,
                uint64_t value0) {
  Initialise(masm, pd, 0, 0, value1, value0);
}

void Initialise(MacroAssembler* masm, const PRegister& pd, uint64_t value0) {
  Initialise(masm, pd, 0, 0, 0, value0);
}

// Conveniently initialise P registers by lane. This is optimised for call-site
// clarity, not generated code quality.
//
// Usage:
//
//   int values[] = { 0x0, 0x1, 0x2 };
//   Initialise(&masm, p0.VnS(), values);  // Sets p0 = 0b'0000'0001'0010
//
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane.
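// (In the usage example above the rightmost value, 0x2, therefore ends up in
// lane 0, forming the least-significant predicate bits.)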
// Unspecified lanes are set to 0 (inactive).
//
// Each element of the `values` array is mapped onto a lane in `pd`. The
// architecture only respects the lower bit, and writes zero to the upper bits,
// but other (encodable) values can be specified if required by the test.
template <typename T, size_t N>
void Initialise(MacroAssembler* masm,
                const PRegisterWithLaneSize& pd,
                const T (&values)[N]) {
  // Turn the array into 64-bit chunks.
  uint64_t chunks[4] = {0, 0, 0, 0};
  VIXL_STATIC_ASSERT(sizeof(chunks) == kPRegMaxSizeInBytes);

  int p_bits_per_lane = pd.GetLaneSizeInBits() / kZRegBitsPerPRegBit;
  VIXL_ASSERT((64 % p_bits_per_lane) == 0);
  VIXL_ASSERT((N * p_bits_per_lane) <= kPRegMaxSize);

  uint64_t p_lane_mask = GetUintMask(p_bits_per_lane);

  VIXL_STATIC_ASSERT(N <= kPRegMaxSize);
  size_t bit = 0;
  for (int n = static_cast<int>(N - 1); n >= 0; n--) {
    VIXL_ASSERT(bit < (sizeof(chunks) * kBitsPerByte));
    uint64_t value = values[n] & p_lane_mask;
    chunks[bit / 64] |= value << (bit % 64);
    bit += p_bits_per_lane;
  }

  Initialise(masm, pd, chunks[3], chunks[2], chunks[1], chunks[0]);
}

// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_z) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  __ Mov(x0, 0x0123456789abcdef);

  // Test basic `Insr` behaviour.
  __ Insr(z0.VnB(), 1);
  __ Insr(z0.VnB(), 2);
  __ Insr(z0.VnB(), x0);
  __ Insr(z0.VnB(), -42);
  __ Insr(z0.VnB(), 0);

  // Test array inputs.
  int z1_inputs[] = {3, 4, 5, -42, 0};
  InsrHelper(&masm, z1.VnH(), z1_inputs);

  // Test that sign-extension works as intended for various lane sizes.
  __ Dup(z2.VnD(), 0);            // Clear the register first.
  __ Insr(z2.VnB(), -42);         // 0xd6
  __ Insr(z2.VnB(), 0xfe);        // 0xfe
  __ Insr(z2.VnH(), -42);         // 0xffd6
  __ Insr(z2.VnH(), 0xfedc);      // 0xfedc
  __ Insr(z2.VnS(), -42);         // 0xffffffd6
  __ Insr(z2.VnS(), 0xfedcba98);  // 0xfedcba98
  // Use another register for VnD(), so we can support 128-bit Z registers.
  __ Insr(z3.VnD(), -42);                 // 0xffffffffffffffd6
  __ Insr(z3.VnD(), 0xfedcba9876543210);  // 0xfedcba9876543210

  END();

  if (CAN_RUN()) {
    RUN();

    // Test that array checks work properly on a register initialised
    // lane-by-lane.
    int z0_inputs_b[] = {0x01, 0x02, 0xef, 0xd6, 0x00};
    ASSERT_EQUAL_SVE(z0_inputs_b, z0.VnB());

    // Test that lane-by-lane checks work properly on a register initialised
    // by array.
    for (size_t i = 0; i < ArrayLength(z1_inputs); i++) {
      // The rightmost (highest-indexed) array element maps to the
      // lowest-numbered lane.
      int lane = static_cast<int>(ArrayLength(z1_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(z1_inputs[i], z1.VnH(), lane);
    }

    uint64_t z2_inputs_d[] = {0x0000d6feffd6fedc, 0xffffffd6fedcba98};
    ASSERT_EQUAL_SVE(z2_inputs_d, z2.VnD());

    uint64_t z3_inputs_d[] = {0xffffffffffffffd6, 0xfedcba9876543210};
    ASSERT_EQUAL_SVE(z3_inputs_d, z3.VnD());
  }
}

// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_p) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // Simple cases: move boolean (0 or 1) values.
  int p0_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
  Initialise(&masm, p0.VnB(), p0_inputs);
  int p1_inputs[] = {1, 0, 1, 1, 0, 1, 1, 1};
  Initialise(&masm, p1.VnH(), p1_inputs);
  int p2_inputs[] = {1, 1, 0, 1};
  Initialise(&masm, p2.VnS(), p2_inputs);
  int p3_inputs[] = {0, 1};
  Initialise(&masm, p3.VnD(), p3_inputs);

  // Advanced cases: move numeric value into architecturally-ignored bits.

  // B-sized lanes get one bit in a P register, so there are no ignored bits.

  // H-sized lanes get two bits in a P register.
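  // For example, with H-sized lanes an input array of {0x3, 0x2, 0x1, 0x0}
  // packs into the predicate bits 0b'11'10'01'00: the rightmost element
  // occupies the two least-significant bits (lane 0), and each further element
  // occupies the next two bits up.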
  int p4_inputs[] = {0x3, 0x2, 0x1, 0x0, 0x1, 0x2, 0x3};
  Initialise(&masm, p4.VnH(), p4_inputs);

  // S-sized lanes get four bits in a P register.
  int p5_inputs[] = {0xc, 0x7, 0x9, 0x6, 0xf};
  Initialise(&masm, p5.VnS(), p5_inputs);

  // D-sized lanes get eight bits in a P register.
  int p6_inputs[] = {0x81, 0xcc, 0x55};
  Initialise(&masm, p6.VnD(), p6_inputs);

  // The largest possible P register has 32 bytes.
  int p7_inputs[] = {0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                     0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                     0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
                     0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f};
  Initialise(&masm, p7.VnD(), p7_inputs);

  END();

  if (CAN_RUN()) {
    RUN();

    // Test that lane-by-lane checks work properly. The rightmost
    // (highest-indexed) array element maps to the lowest-numbered lane.
    for (size_t i = 0; i < ArrayLength(p0_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p0_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p0_inputs[i], p0.VnB(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p1_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p1_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p1_inputs[i], p1.VnH(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p2_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p2_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p2_inputs[i], p2.VnS(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p3_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p3_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p3_inputs[i], p3.VnD(), lane);
    }

    // Test that array checks work properly on predicates initialised with a
    // possibly-different lane size.
    // 0b...11'10'01'00'01'10'11
    int p4_expected[] = {0x39, 0x1b};
    ASSERT_EQUAL_SVE(p4_expected, p4.VnD());

    ASSERT_EQUAL_SVE(p5_inputs, p5.VnS());

    // 0b...10000001'11001100'01010101
    int p6_expected[] = {2, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 1};
    ASSERT_EQUAL_SVE(p6_expected, p6.VnH());

    // 0b...10011100'10011101'10011110'10011111
    int p7_expected[] = {1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
                         1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
  }
}

// Test that writes to V registers clear the high bits of the corresponding Z
// register.
TEST_SVE(sve_v_write_clear) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON,
                          CPUFeatures::kFP,
                          CPUFeatures::kSVE);
  START();

  // The Simulator has two mechanisms for writing V registers:
  //  - Write*Register, calling through to SimRegisterBase::Write.
  //  - LogicVRegister::ClearForWrite followed by one or more lane updates.
  // Try to cover both variants.

  // Prepare some known inputs.
  uint8_t data[kQRegSizeInBytes];
  for (size_t i = 0; i < kQRegSizeInBytes; i++) {
    data[i] = 42 + i;
  }
  __ Mov(x10, reinterpret_cast<uintptr_t>(data));
  __ Fmov(d30, 42.0);

  // Use Index to label the lane indices, so failures are easy to detect and
  // diagnose.
  __ Index(z0.VnB(), 0, 1);
  __ Index(z1.VnB(), 0, 1);
  __ Index(z2.VnB(), 0, 1);
  __ Index(z3.VnB(), 0, 1);
  __ Index(z4.VnB(), 0, 1);

  __ Index(z10.VnB(), 0, -1);
  __ Index(z11.VnB(), 0, -1);
  __ Index(z12.VnB(), 0, -1);
  __ Index(z13.VnB(), 0, -1);
  __ Index(z14.VnB(), 0, -1);

  // Instructions using Write*Register (and SimRegisterBase::Write).
  __ Ldr(b0, MemOperand(x10));
  __ Fcvt(h1, d30);
  __ Fmov(s2, 1.5f);
  __ Fmov(d3, d30);
  __ Ldr(q4, MemOperand(x10));

  // Instructions using LogicVRegister::ClearForWrite.
  // These also (incidentally) test that across-lane instructions correctly
  // ignore the high-order Z register lanes.
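  // For example, z10.VnB() holds {0, -1, -2, ...}, so the 16 low B lanes seen
  // by `Sminv(b10, v10.V16B())` are {0, -1, ..., -15} and the expected result
  // is -15 (0xf1); if the high (SVE-only) lanes were erroneously included, the
  // minimum would generally differ.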
__ Sminv(b10, v10.V16B()); __ Addv(h11, v11.V4H()); __ Saddlv(s12, v12.V8H()); __ Dup(v13.V8B(), b13, kDRegSizeInBytes); __ Uaddl(v14.V8H(), v14.V8B(), v14.V8B()); END(); if (CAN_RUN()) { RUN(); // Check the Q part first. ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000002a, v0); ASSERT_EQUAL_128(0x0000000000000000, 0x0000000000005140, v1); // 42.0 (f16) ASSERT_EQUAL_128(0x0000000000000000, 0x000000003fc00000, v2); // 1.5 (f32) ASSERT_EQUAL_128(0x0000000000000000, 0x4045000000000000, v3); // 42.0 (f64) ASSERT_EQUAL_128(0x3938373635343332, 0x31302f2e2d2c2b2a, v4); ASSERT_EQUAL_128(0x0000000000000000, 0x00000000000000f1, v10); // -15 // 0xf9fa + 0xfbfc + 0xfdfe + 0xff00 -> 0xf2f4 ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000f2f4, v11); // 0xfffff1f2 + 0xfffff3f4 + ... + 0xfffffdfe + 0xffffff00 -> 0xffffc6c8 ASSERT_EQUAL_128(0x0000000000000000, 0x00000000ffffc6c8, v12); ASSERT_EQUAL_128(0x0000000000000000, 0xf8f8f8f8f8f8f8f8, v13); // [-8] x 8 // [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000] // + [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000] // -> [0x01f2, 0x01f4, 0x01f6, 0x01f8, 0x01fa, 0x01fc, 0x01fe, 0x0000] ASSERT_EQUAL_128(0x01f201f401f601f8, 0x01fa01fc01fe0000, v14); // Check that the upper lanes are all clear. for (int i = kQRegSizeInBytes; i < core.GetSVELaneCount(kBRegSize); i++) { ASSERT_EQUAL_SVE_LANE(0x00, z0.VnB(), i); ASSERT_EQUAL_SVE_LANE(0x00, z1.VnB(), i); ASSERT_EQUAL_SVE_LANE(0x00, z2.VnB(), i); ASSERT_EQUAL_SVE_LANE(0x00, z3.VnB(), i); ASSERT_EQUAL_SVE_LANE(0x00, z4.VnB(), i); ASSERT_EQUAL_SVE_LANE(0x00, z10.VnB(), i); ASSERT_EQUAL_SVE_LANE(0x00, z11.VnB(), i); ASSERT_EQUAL_SVE_LANE(0x00, z12.VnB(), i); ASSERT_EQUAL_SVE_LANE(0x00, z13.VnB(), i); ASSERT_EQUAL_SVE_LANE(0x00, z14.VnB(), i); } } } static void MlaMlsHelper(Test* config, unsigned lane_size_in_bits) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int zd_inputs[] = {0xbb, 0xcc, 0xdd, 0xee}; int za_inputs[] = {-39, 1, -3, 2}; int zn_inputs[] = {-5, -20, 9, 8}; int zm_inputs[] = {9, -5, 4, 5}; ZRegister zd = z0.WithLaneSize(lane_size_in_bits); ZRegister za = z1.WithLaneSize(lane_size_in_bits); ZRegister zn = z2.WithLaneSize(lane_size_in_bits); ZRegister zm = z3.WithLaneSize(lane_size_in_bits); // TODO: Use a simple `Dup` once it accepts arbitrary immediates. InsrHelper(&masm, zd, zd_inputs); InsrHelper(&masm, za, za_inputs); InsrHelper(&masm, zn, zn_inputs); InsrHelper(&masm, zm, zm_inputs); int p0_inputs[] = {1, 1, 0, 1}; int p1_inputs[] = {1, 0, 1, 1}; int p2_inputs[] = {0, 1, 1, 1}; int p3_inputs[] = {1, 1, 1, 0}; Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), p0_inputs); Initialise(&masm, p1.WithLaneSize(lane_size_in_bits), p1_inputs); Initialise(&masm, p2.WithLaneSize(lane_size_in_bits), p2_inputs); Initialise(&masm, p3.WithLaneSize(lane_size_in_bits), p3_inputs); // The Mla macro automatically selects between mla, mad and movprfx + mla // based on what registers are aliased. 
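  // Roughly speaking: if the destination aliases the accumulator, a plain
  // `mla` suffices; if it aliases one of the multiplicands, the operands can
  // be re-ordered to use `mad`; otherwise the macro emits `movprfx` + `mla`.
  // (Illustrative summary; the MacroAssembler implements the exact rules.)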
ZRegister mla_da_result = z10.WithLaneSize(lane_size_in_bits); ZRegister mla_dn_result = z11.WithLaneSize(lane_size_in_bits); ZRegister mla_dm_result = z12.WithLaneSize(lane_size_in_bits); ZRegister mla_d_result = z13.WithLaneSize(lane_size_in_bits); __ Mov(mla_da_result, za); __ Mla(mla_da_result, p0.Merging(), mla_da_result, zn, zm); __ Mov(mla_dn_result, zn); __ Mla(mla_dn_result, p1.Merging(), za, mla_dn_result, zm); __ Mov(mla_dm_result, zm); __ Mla(mla_dm_result, p2.Merging(), za, zn, mla_dm_result); __ Mov(mla_d_result, zd); __ Mla(mla_d_result, p3.Merging(), za, zn, zm); // The Mls macro automatically selects between mls, msb and movprfx + mls // based on what registers are aliased. ZRegister mls_da_result = z20.WithLaneSize(lane_size_in_bits); ZRegister mls_dn_result = z21.WithLaneSize(lane_size_in_bits); ZRegister mls_dm_result = z22.WithLaneSize(lane_size_in_bits); ZRegister mls_d_result = z23.WithLaneSize(lane_size_in_bits); __ Mov(mls_da_result, za); __ Mls(mls_da_result, p0.Merging(), mls_da_result, zn, zm); __ Mov(mls_dn_result, zn); __ Mls(mls_dn_result, p1.Merging(), za, mls_dn_result, zm); __ Mov(mls_dm_result, zm); __ Mls(mls_dm_result, p2.Merging(), za, zn, mls_dm_result); __ Mov(mls_d_result, zd); __ Mls(mls_d_result, p3.Merging(), za, zn, zm); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits)); ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits)); ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits)); int mla[] = {-84, 101, 33, 42}; int mls[] = {6, -99, -39, -38}; int mla_da_expected[] = {mla[0], mla[1], za_inputs[2], mla[3]}; ASSERT_EQUAL_SVE(mla_da_expected, mla_da_result); int mla_dn_expected[] = {mla[0], zn_inputs[1], mla[2], mla[3]}; ASSERT_EQUAL_SVE(mla_dn_expected, mla_dn_result); int mla_dm_expected[] = {zm_inputs[0], mla[1], mla[2], mla[3]}; ASSERT_EQUAL_SVE(mla_dm_expected, mla_dm_result); int mla_d_expected[] = {mla[0], mla[1], mla[2], zd_inputs[3]}; ASSERT_EQUAL_SVE(mla_d_expected, mla_d_result); int mls_da_expected[] = {mls[0], mls[1], za_inputs[2], mls[3]}; ASSERT_EQUAL_SVE(mls_da_expected, mls_da_result); int mls_dn_expected[] = {mls[0], zn_inputs[1], mls[2], mls[3]}; ASSERT_EQUAL_SVE(mls_dn_expected, mls_dn_result); int mls_dm_expected[] = {zm_inputs[0], mls[1], mls[2], mls[3]}; ASSERT_EQUAL_SVE(mls_dm_expected, mls_dm_result); int mls_d_expected[] = {mls[0], mls[1], mls[2], zd_inputs[3]}; ASSERT_EQUAL_SVE(mls_d_expected, mls_d_result); } } TEST_SVE(sve_mla_mls_b) { MlaMlsHelper(config, kBRegSize); } TEST_SVE(sve_mla_mls_h) { MlaMlsHelper(config, kHRegSize); } TEST_SVE(sve_mla_mls_s) { MlaMlsHelper(config, kSRegSize); } TEST_SVE(sve_mla_mls_d) { MlaMlsHelper(config, kDRegSize); } TEST_SVE(sve_bitwise_unpredicate_logical) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t z8_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef}; InsrHelper(&masm, z8.VnD(), z8_inputs); uint64_t z15_inputs[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff}; InsrHelper(&masm, z15.VnD(), z15_inputs); __ And(z1.VnD(), z8.VnD(), z15.VnD()); __ Bic(z2.VnD(), z8.VnD(), z15.VnD()); __ Eor(z3.VnD(), z8.VnD(), z15.VnD()); __ Orr(z4.VnD(), z8.VnD(), z15.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t z1_expected[] = {0xfedcaa8854540000, 0x0000454588aacdef}; uint64_t z2_expected[] = {0x0000101022003210, 0x0123002201010000}; uint64_t z3_expected[] = {0x01235476ab89fedc, 0xcdef98ba67453210}; uint64_t z4_expected[] = {0xfffffefeffddfedc, 0xcdefddffefefffff}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); 
ASSERT_EQUAL_SVE(z2_expected, z2.VnD()); ASSERT_EQUAL_SVE(z3_expected, z3.VnD()); ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); } } TEST_SVE(sve_last_r) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON); START(); __ Pfalse(p1.VnB()); int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0}; Initialise(&masm, p2.VnB(), p2_inputs); Initialise(&masm, p3.VnB(), p3_inputs); __ Ptrue(p4.VnB()); __ Index(z0.VnB(), 0x10, 1); __ Lasta(x1, p1, z0.VnB()); __ Lastb(x2, p1, z0.VnB()); __ Lasta(x3, p2, z0.VnB()); __ Lastb(x4, p2, z0.VnB()); __ Lasta(x5, p3, z0.VnB()); __ Lastb(x6, p3, z0.VnB()); __ Lasta(x7, p4, z0.VnB()); __ Punpklo(p3.VnH(), p3.VnB()); __ Index(z0.VnH(), 0x1110, 1); __ Lasta(x9, p1, z0.VnH()); __ Lastb(x10, p3, z0.VnH()); __ Lasta(x12, p4, z0.VnH()); __ Index(z0.VnS(), 0x11111110, 1); __ Lastb(x13, p1, z0.VnS()); __ Lasta(x14, p2, z0.VnS()); __ Lastb(x18, p4, z0.VnS()); __ Index(z0.VnD(), 0x1111111111111110, 1); __ Lasta(x19, p1, z0.VnD()); __ Lastb(x20, p3, z0.VnD()); __ Lasta(x21, p3, z0.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_64(0x0000000000000010, x1); ASSERT_EQUAL_64(0x0000000000000011, x3); ASSERT_EQUAL_64(0x0000000000000010, x4); ASSERT_EQUAL_64(0x0000000000000019, x5); ASSERT_EQUAL_64(0x0000000000000018, x6); ASSERT_EQUAL_64(0x0000000000000010, x7); ASSERT_EQUAL_64(0x0000000000001110, x9); ASSERT_EQUAL_64(0x0000000000001110, x12); ASSERT_EQUAL_64(0x0000000011111111, x14); ASSERT_EQUAL_64(0x1111111111111110, x19); int vl = core.GetSVELaneCount(kBRegSize) * 8; switch (vl) { case 128: ASSERT_EQUAL_64(0x000000000000001f, x2); ASSERT_EQUAL_64(0x0000000000001116, x10); ASSERT_EQUAL_64(0x0000000011111113, x13); ASSERT_EQUAL_64(0x0000000011111113, x18); ASSERT_EQUAL_64(0x1111111111111111, x20); ASSERT_EQUAL_64(0x1111111111111110, x21); break; case 384: ASSERT_EQUAL_64(0x000000000000003f, x2); ASSERT_EQUAL_64(0x0000000000001118, x10); ASSERT_EQUAL_64(0x000000001111111b, x13); ASSERT_EQUAL_64(0x000000001111111b, x18); ASSERT_EQUAL_64(0x1111111111111112, x20); ASSERT_EQUAL_64(0x1111111111111113, x21); break; case 2048: ASSERT_EQUAL_64(0x000000000000000f, x2); ASSERT_EQUAL_64(0x0000000000001118, x10); ASSERT_EQUAL_64(0x000000001111114f, x13); ASSERT_EQUAL_64(0x000000001111114f, x18); ASSERT_EQUAL_64(0x1111111111111112, x20); ASSERT_EQUAL_64(0x1111111111111113, x21); break; default: printf("WARNING: Some tests skipped due to unexpected VL.\n"); break; } } } TEST_SVE(sve_last_v) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON); START(); __ Pfalse(p1.VnB()); int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0}; Initialise(&masm, p2.VnB(), p2_inputs); Initialise(&masm, p3.VnB(), p3_inputs); __ Ptrue(p4.VnB()); __ Index(z0.VnB(), 0x10, 1); __ Lasta(b1, p1, z0.VnB()); __ Lastb(b2, p1, z0.VnB()); __ Lasta(b3, p2, z0.VnB()); __ Lastb(b4, p2, z0.VnB()); __ Lasta(b5, p3, z0.VnB()); __ Lastb(b6, p3, z0.VnB()); __ Lasta(b7, p4, z0.VnB()); __ Punpklo(p3.VnH(), p3.VnB()); __ Index(z0.VnH(), 0x1110, 1); __ Lasta(h9, p1, z0.VnH()); __ Lastb(h10, p3, z0.VnH()); __ Lasta(h12, p4, z0.VnH()); __ Index(z0.VnS(), 0x11111110, 1); __ Lastb(s13, p1, z0.VnS()); __ Lasta(s14, p2, z0.VnS()); __ Lastb(s18, p4, z0.VnS()); __ Index(z0.VnD(), 0x1111111111111110, 1); __ Lasta(d19, p1, z0.VnD()); __ Lastb(d20, p3, z0.VnD()); __ Lasta(d21, p3, z0.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_128(0, 0x0000000000000010, q1); 
ASSERT_EQUAL_128(0, 0x0000000000000011, q3); ASSERT_EQUAL_128(0, 0x0000000000000010, q4); ASSERT_EQUAL_128(0, 0x0000000000000019, q5); ASSERT_EQUAL_128(0, 0x0000000000000018, q6); ASSERT_EQUAL_128(0, 0x0000000000000010, q7); ASSERT_EQUAL_128(0, 0x0000000000001110, q9); ASSERT_EQUAL_128(0, 0x0000000000001110, q12); ASSERT_EQUAL_128(0, 0x0000000011111111, q14); ASSERT_EQUAL_128(0, 0x1111111111111110, q19); int vl = core.GetSVELaneCount(kBRegSize) * 8; switch (vl) { case 128: ASSERT_EQUAL_128(0, 0x000000000000001f, q2); ASSERT_EQUAL_128(0, 0x0000000000001116, q10); ASSERT_EQUAL_128(0, 0x0000000011111113, q13); ASSERT_EQUAL_128(0, 0x0000000011111113, q18); ASSERT_EQUAL_128(0, 0x1111111111111111, q20); ASSERT_EQUAL_128(0, 0x1111111111111110, q21); break; case 384: ASSERT_EQUAL_128(0, 0x000000000000003f, q2); ASSERT_EQUAL_128(0, 0x0000000000001118, q10); ASSERT_EQUAL_128(0, 0x000000001111111b, q13); ASSERT_EQUAL_128(0, 0x000000001111111b, q18); ASSERT_EQUAL_128(0, 0x1111111111111112, q20); ASSERT_EQUAL_128(0, 0x1111111111111113, q21); break; case 2048: ASSERT_EQUAL_128(0, 0x000000000000000f, q2); ASSERT_EQUAL_128(0, 0x0000000000001118, q10); ASSERT_EQUAL_128(0, 0x000000001111114f, q13); ASSERT_EQUAL_128(0, 0x000000001111114f, q18); ASSERT_EQUAL_128(0, 0x1111111111111112, q20); ASSERT_EQUAL_128(0, 0x1111111111111113, q21); break; default: printf("WARNING: Some tests skipped due to unexpected VL.\n"); break; } } } TEST_SVE(sve_clast_r) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON); START(); __ Pfalse(p1.VnB()); int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0}; Initialise(&masm, p2.VnB(), p2_inputs); Initialise(&masm, p3.VnB(), p3_inputs); __ Ptrue(p4.VnB()); __ Index(z0.VnB(), 0x10, 1); __ Mov(x1, -1); __ Mov(x2, -1); __ Clasta(x1, p1, x1, z0.VnB()); __ Clastb(x2, p1, x2, z0.VnB()); __ Clasta(x3, p2, x3, z0.VnB()); __ Clastb(x4, p2, x4, z0.VnB()); __ Clasta(x5, p3, x5, z0.VnB()); __ Clastb(x6, p3, x6, z0.VnB()); __ Clasta(x7, p4, x7, z0.VnB()); __ Punpklo(p3.VnH(), p3.VnB()); __ Index(z0.VnH(), 0x1110, 1); __ Mov(x9, -1); __ Clasta(x9, p1, x9, z0.VnH()); __ Clastb(x10, p3, x10, z0.VnH()); __ Clasta(x12, p4, x12, z0.VnH()); __ Index(z0.VnS(), 0x11111110, 1); __ Mov(x13, -1); __ Clasta(x13, p1, x13, z0.VnS()); __ Clastb(x14, p2, x14, z0.VnS()); __ Clasta(x18, p4, x18, z0.VnS()); __ Index(z0.VnD(), 0x1111111111111110, 1); __ Mov(x19, -1); __ Clasta(x19, p1, x19, z0.VnD()); __ Clastb(x20, p2, x20, z0.VnD()); __ Clasta(x21, p4, x21, z0.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_64(0x00000000000000ff, x1); ASSERT_EQUAL_64(0x00000000000000ff, x2); ASSERT_EQUAL_64(0x0000000000000011, x3); ASSERT_EQUAL_64(0x0000000000000010, x4); ASSERT_EQUAL_64(0x0000000000000019, x5); ASSERT_EQUAL_64(0x0000000000000018, x6); ASSERT_EQUAL_64(0x0000000000000010, x7); ASSERT_EQUAL_64(0x000000000000ffff, x9); ASSERT_EQUAL_64(0x0000000000001110, x12); ASSERT_EQUAL_64(0x00000000ffffffff, x13); ASSERT_EQUAL_64(0x0000000011111110, x14); ASSERT_EQUAL_64(0x0000000011111110, x18); ASSERT_EQUAL_64(0xffffffffffffffff, x19); ASSERT_EQUAL_64(0x1111111111111110, x20); ASSERT_EQUAL_64(0x1111111111111110, x21); int vl = core.GetSVELaneCount(kBRegSize) * 8; switch (vl) { case 128: ASSERT_EQUAL_64(0x0000000000001116, x10); break; case 384: ASSERT_EQUAL_64(0x0000000000001118, x10); break; case 2048: ASSERT_EQUAL_64(0x0000000000001118, x10); break; default: printf("WARNING: Some tests skipped due to unexpected VL.\n"); break; } } 
} TEST_SVE(sve_clast_v) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON); START(); __ Pfalse(p1.VnB()); int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0}; Initialise(&masm, p2.VnB(), p2_inputs); Initialise(&masm, p3.VnB(), p3_inputs); __ Ptrue(p4.VnB()); __ Index(z0.VnB(), 0x10, 1); __ Dup(z1.VnB(), -1); __ Dup(z2.VnB(), -1); __ Clasta(b1, p1, b1, z0.VnB()); __ Clastb(b2, p1, b2, z0.VnB()); __ Clasta(b3, p2, b3, z0.VnB()); __ Clastb(b4, p2, b4, z0.VnB()); __ Clasta(b5, p3, b5, z0.VnB()); __ Clastb(b6, p3, b6, z0.VnB()); __ Clasta(b7, p4, b7, z0.VnB()); __ Punpklo(p3.VnH(), p3.VnB()); __ Index(z0.VnH(), 0x1110, 1); __ Dup(z9.VnB(), -1); __ Clasta(h9, p1, h9, z0.VnH()); __ Clastb(h10, p3, h10, z0.VnH()); __ Clasta(h12, p4, h12, z0.VnH()); __ Index(z0.VnS(), 0x11111110, 1); __ Dup(z13.VnB(), -1); __ Clasta(s13, p1, s13, z0.VnS()); __ Clastb(s14, p2, s14, z0.VnS()); __ Clasta(s18, p4, s18, z0.VnS()); __ Index(z0.VnD(), 0x1111111111111110, 1); __ Dup(z19.VnB(), -1); __ Clasta(d19, p1, d19, z0.VnD()); __ Clastb(d20, p2, d20, z0.VnD()); __ Clasta(d21, p4, d21, z0.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_128(0, 0x00000000000000ff, q1); ASSERT_EQUAL_128(0, 0x00000000000000ff, q2); ASSERT_EQUAL_128(0, 0x0000000000000011, q3); ASSERT_EQUAL_128(0, 0x0000000000000010, q4); ASSERT_EQUAL_128(0, 0x0000000000000019, q5); ASSERT_EQUAL_128(0, 0x0000000000000018, q6); ASSERT_EQUAL_128(0, 0x0000000000000010, q7); ASSERT_EQUAL_128(0, 0x000000000000ffff, q9); ASSERT_EQUAL_128(0, 0x0000000000001110, q12); ASSERT_EQUAL_128(0, 0x00000000ffffffff, q13); ASSERT_EQUAL_128(0, 0x0000000011111110, q14); ASSERT_EQUAL_128(0, 0x0000000011111110, q18); ASSERT_EQUAL_128(0, 0xffffffffffffffff, q19); ASSERT_EQUAL_128(0, 0x1111111111111110, q20); ASSERT_EQUAL_128(0, 0x1111111111111110, q21); int vl = core.GetSVELaneCount(kBRegSize) * 8; switch (vl) { case 128: ASSERT_EQUAL_128(0, 0x0000000000001116, q10); break; case 384: ASSERT_EQUAL_128(0, 0x0000000000001118, q10); break; case 2048: ASSERT_EQUAL_128(0, 0x0000000000001118, q10); break; default: printf("WARNING: Some tests skipped due to unexpected VL.\n"); break; } } } TEST_SVE(sve_clast_z) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON); START(); __ Pfalse(p1.VnB()); int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0}; Initialise(&masm, p2.VnB(), p2_inputs); Initialise(&masm, p3.VnB(), p3_inputs); __ Ptrue(p4.VnB()); __ Index(z0.VnB(), 0x10, 1); __ Dup(z1.VnB(), 0xff); __ Dup(z2.VnB(), 0xff); __ Clasta(z1.VnB(), p1, z1.VnB(), z0.VnB()); __ Clastb(z2.VnB(), p1, z2.VnB(), z0.VnB()); __ Clasta(z3.VnB(), p2, z3.VnB(), z0.VnB()); __ Clastb(z4.VnB(), p2, z4.VnB(), z0.VnB()); __ Clasta(z5.VnB(), p3, z5.VnB(), z0.VnB()); __ Clastb(z6.VnB(), p3, z6.VnB(), z0.VnB()); __ Clasta(z7.VnB(), p4, z7.VnB(), z0.VnB()); __ Punpklo(p3.VnH(), p3.VnB()); __ Index(z0.VnH(), 0x1110, 1); __ Dup(z9.VnB(), 0xff); __ Clasta(z9.VnH(), p1, z9.VnH(), z0.VnH()); __ Clastb(z10.VnH(), p3, z10.VnH(), z0.VnH()); __ Clasta(z12.VnH(), p4, z12.VnH(), z0.VnH()); __ Index(z0.VnS(), 0x11111110, 1); __ Dup(z13.VnB(), 0xff); __ Clasta(z13.VnS(), p1, z13.VnS(), z0.VnS()); __ Clastb(z14.VnS(), p2, z14.VnS(), z0.VnS()); __ Clasta(z16.VnS(), p4, z16.VnS(), z0.VnS()); __ Index(z0.VnD(), 0x1111111111111110, 1); __ Dup(z17.VnB(), 0xff); __ Clasta(z17.VnD(), p1, z17.VnD(), z0.VnD()); __ Clastb(z18.VnD(), p2, z18.VnD(), 
z0.VnD()); __ Clasta(z20.VnD(), p4, z20.VnD(), z0.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t z1_expected[] = {0xffffffffffffffff, 0xffffffffffffffff}; uint64_t z2_expected[] = {0xffffffffffffffff, 0xffffffffffffffff}; uint64_t z3_expected[] = {0x1111111111111111, 0x1111111111111111}; uint64_t z4_expected[] = {0x1010101010101010, 0x1010101010101010}; uint64_t z5_expected[] = {0x1919191919191919, 0x1919191919191919}; uint64_t z6_expected[] = {0x1818181818181818, 0x1818181818181818}; uint64_t z7_expected[] = {0x1010101010101010, 0x1010101010101010}; uint64_t z9_expected[] = {0xffffffffffffffff, 0xffffffffffffffff}; uint64_t z12_expected[] = {0x1110111011101110, 0x1110111011101110}; uint64_t z13_expected[] = {0xffffffffffffffff, 0xffffffffffffffff}; uint64_t z14_expected[] = {0x1111111011111110, 0x1111111011111110}; uint64_t z16_expected[] = {0x1111111011111110, 0x1111111011111110}; uint64_t z17_expected[] = {0xffffffffffffffff, 0xffffffffffffffff}; uint64_t z18_expected[] = {0x1111111111111110, 0x1111111111111110}; uint64_t z20_expected[] = {0x1111111111111110, 0x1111111111111110}; uint64_t z10_expected_vl128[] = {0x1116111611161116, 0x1116111611161116}; uint64_t z10_expected_vl_long[] = {0x1118111811181118, 0x1118111811181118}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); ASSERT_EQUAL_SVE(z2_expected, z2.VnD()); ASSERT_EQUAL_SVE(z3_expected, z3.VnD()); ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); ASSERT_EQUAL_SVE(z6_expected, z6.VnD()); ASSERT_EQUAL_SVE(z7_expected, z7.VnD()); ASSERT_EQUAL_SVE(z9_expected, z9.VnD()); ASSERT_EQUAL_SVE(z12_expected, z12.VnD()); ASSERT_EQUAL_SVE(z13_expected, z13.VnD()); ASSERT_EQUAL_SVE(z14_expected, z14.VnD()); ASSERT_EQUAL_SVE(z16_expected, z16.VnD()); ASSERT_EQUAL_SVE(z17_expected, z17.VnD()); ASSERT_EQUAL_SVE(z18_expected, z18.VnD()); ASSERT_EQUAL_SVE(z20_expected, z20.VnD()); int vl = core.GetSVELaneCount(kBRegSize) * 8; switch (vl) { case 128: ASSERT_EQUAL_SVE(z10_expected_vl128, z10.VnD()); break; case 384: case 2048: ASSERT_EQUAL_SVE(z10_expected_vl_long, z10.VnD()); break; default: printf("WARNING: Some tests skipped due to unexpected VL.\n"); break; } } } TEST_SVE(sve_compact) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON); START(); __ Ptrue(p0.VnB()); __ Pfalse(p1.VnB()); __ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); __ Zip1(p3.VnS(), p1.VnS(), p0.VnS()); __ Zip1(p4.VnD(), p0.VnD(), p1.VnD()); __ Index(z0.VnS(), 0x11111111, 0x11111111); __ Mov(q0, q0); __ Compact(z1.VnS(), p0, z0.VnS()); __ Compact(z2.VnS(), p2, z0.VnS()); __ Compact(z0.VnS(), p3, z0.VnS()); __ Index(z3.VnD(), 0x1111111111111111, 0x1111111111111111); __ Mov(q3, q3); __ Compact(z4.VnD(), p0, z3.VnD()); __ Compact(z5.VnD(), p1, z3.VnD()); __ Compact(z6.VnD(), p4, z3.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t z1_expected[] = {0x4444444433333333, 0x2222222211111111}; uint64_t z2_expected[] = {0x0000000000000000, 0x3333333311111111}; uint64_t z0_expected[] = {0x0000000000000000, 0x4444444422222222}; uint64_t z4_expected[] = {0x2222222222222222, 0x1111111111111111}; uint64_t z5_expected[] = {0x0000000000000000, 0x0000000000000000}; uint64_t z6_expected[] = {0x0000000000000000, 0x1111111111111111}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); ASSERT_EQUAL_SVE(z2_expected, z2.VnD()); ASSERT_EQUAL_SVE(z0_expected, z0.VnD()); ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); ASSERT_EQUAL_SVE(z6_expected, z6.VnD()); } } TEST_SVE(sve_splice) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ 
Ptrue(p0.VnB()); __ Pfalse(p1.VnB()); int p2b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int p3b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}; int p4b_inputs[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int p5b_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}; int p6b_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}; Initialise(&masm, p2.VnB(), p2b_inputs); Initialise(&masm, p3.VnB(), p3b_inputs); Initialise(&masm, p4.VnB(), p4b_inputs); Initialise(&masm, p5.VnB(), p5b_inputs); Initialise(&masm, p6.VnB(), p6b_inputs); __ Index(z30.VnB(), 1, 1); __ Index(z0.VnB(), -1, -1); __ Splice(z0.VnB(), p0, z0.VnB(), z30.VnB()); __ Index(z1.VnB(), -1, -1); __ Splice(z1.VnB(), p1, z1.VnB(), z30.VnB()); __ Index(z2.VnB(), -1, -1); __ Splice(z2.VnB(), p2, z2.VnB(), z30.VnB()); __ Index(z3.VnB(), -1, -1); __ Splice(z3.VnB(), p3, z3.VnB(), z30.VnB()); __ Index(z4.VnB(), -1, -1); __ Splice(z4.VnB(), p4, z4.VnB(), z30.VnB()); __ Index(z5.VnB(), -1, -1); __ Splice(z5.VnB(), p5, z5.VnB(), z30.VnB()); __ Index(z6.VnB(), -1, -1); __ Splice(z6.VnB(), p6, z6.VnB(), z30.VnB()); int p2h_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0}; int p3h_inputs[] = {0, 0, 1, 0, 0, 0, 1, 0}; Initialise(&masm, p2.VnH(), p2h_inputs); Initialise(&masm, p3.VnH(), p3h_inputs); __ Index(z30.VnH(), 1, 1); __ Index(z29.VnH(), -1, -1); __ Splice(z7.VnH(), p2, z29.VnH(), z30.VnH()); __ Splice(z8.VnH(), p3, z29.VnH(), z30.VnH()); int p2s_inputs[] = {0, 0, 1, 0}; int p3s_inputs[] = {1, 0, 1, 0}; Initialise(&masm, p2.VnS(), p2s_inputs); Initialise(&masm, p3.VnS(), p3s_inputs); __ Index(z30.VnS(), 1, 1); __ Index(z29.VnS(), -1, -1); __ Splice(z9.VnS(), p2, z29.VnS(), z30.VnS()); __ Splice(z10.VnS(), p3, z29.VnS(), z30.VnS()); int p2d_inputs[] = {0, 1}; int p3d_inputs[] = {1, 0}; Initialise(&masm, p2.VnD(), p2d_inputs); Initialise(&masm, p3.VnD(), p3d_inputs); __ Index(z30.VnD(), 1, 1); __ Index(z29.VnD(), -1, -1); __ Splice(z11.VnD(), p2, z29.VnD(), z30.VnD()); __ Splice(z30.VnD(), p3, z29.VnD(), z30.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t z0_expected[] = {0xf0f1f2f3f4f5f6f7, 0xf8f9fafbfcfdfeff}; uint64_t z1_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201}; uint64_t z2_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201ff}; uint64_t z3_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201fe}; uint64_t z4_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201f0}; uint64_t z5_expected[] = {0x0c0b0a0908070605, 0x04030201f6f7f8f9}; uint64_t z6_expected[] = {0x01f0f1f2f3f4f5f6, 0xf7f8f9fafbfcfdfe}; uint64_t z7_expected[] = {0x0007000600050004, 0x000300020001fffe}; uint64_t z8_expected[] = {0x000300020001fffa, 0xfffbfffcfffdfffe}; uint64_t z9_expected[] = {0x0000000300000002, 0x00000001fffffffe}; uint64_t z10_expected[] = {0x00000001fffffffc, 0xfffffffdfffffffe}; uint64_t z11_expected[] = {0x0000000000000001, 0xffffffffffffffff}; uint64_t z30_expected[] = {0x0000000000000001, 0xfffffffffffffffe}; ASSERT_EQUAL_SVE(z0_expected, z0.VnD()); ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); ASSERT_EQUAL_SVE(z2_expected, z2.VnD()); ASSERT_EQUAL_SVE(z3_expected, z3.VnD()); ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); ASSERT_EQUAL_SVE(z6_expected, z6.VnD()); ASSERT_EQUAL_SVE(z7_expected, z7.VnD()); ASSERT_EQUAL_SVE(z8_expected, z8.VnD()); ASSERT_EQUAL_SVE(z9_expected, z9.VnD()); ASSERT_EQUAL_SVE(z10_expected, z10.VnD()); ASSERT_EQUAL_SVE(z11_expected, z11.VnD()); ASSERT_EQUAL_SVE(z30_expected, z30.VnD()); } } TEST_SVE(sve_predicate_logical) { 
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // 0b...01011010'10110111 int p10_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1}; // Pm // 0b...11011001'01010010 int p11_inputs[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0}; // Pn // 0b...01010101'10110010 int p12_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0}; // pg Initialise(&masm, p10.VnB(), p10_inputs); Initialise(&masm, p11.VnB(), p11_inputs); Initialise(&masm, p12.VnB(), p12_inputs); __ Ands(p0.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB()); __ Mrs(x0, NZCV); __ Bics(p1.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB()); __ Mrs(x1, NZCV); __ Eor(p2.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB()); __ Nand(p3.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB()); __ Nor(p4.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB()); __ Orn(p5.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB()); __ Orr(p6.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB()); __ Sel(p7.VnB(), p12, p11.VnB(), p10.VnB()); END(); if (CAN_RUN()) { RUN(); // 0b...01010000'00010010 int p0_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0}; // 0b...00000001'00000000 int p1_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}; // 0b...00000001'10100000 int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0}; // 0b...00000101'10100000 int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0}; // 0b...00000100'00000000 int p4_expected[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // 0b...01010101'00010010 int p5_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0}; // 0b...01010001'10110010 int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0}; // 0b...01011011'00010111 int p7_expected[] = {0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1}; ASSERT_EQUAL_SVE(p0_expected, p0.VnB()); ASSERT_EQUAL_SVE(p1_expected, p1.VnB()); ASSERT_EQUAL_SVE(p2_expected, p2.VnB()); ASSERT_EQUAL_SVE(p3_expected, p3.VnB()); ASSERT_EQUAL_SVE(p4_expected, p4.VnB()); ASSERT_EQUAL_SVE(p5_expected, p5.VnB()); ASSERT_EQUAL_SVE(p6_expected, p6.VnB()); ASSERT_EQUAL_SVE(p7_expected, p7.VnB()); ASSERT_EQUAL_32(SVEFirstFlag, w0); ASSERT_EQUAL_32(SVENotLastFlag, w1); } } TEST_SVE(sve_int_compare_vectors) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int z10_inputs[] = {0x00, 0x80, 0xff, 0x7f, 0x00, 0x00, 0x00, 0xff}; int z11_inputs[] = {0x00, 0x00, 0x00, 0x00, 0x80, 0xff, 0x7f, 0xfe}; int p0_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1}; InsrHelper(&masm, z10.VnB(), z10_inputs); InsrHelper(&masm, z11.VnB(), z11_inputs); Initialise(&masm, p0.VnB(), p0_inputs); __ Cmphs(p6.VnB(), p0.Zeroing(), z10.VnB(), z11.VnB()); __ Mrs(x6, NZCV); uint64_t z12_inputs[] = {0xffffffffffffffff, 0x8000000000000000}; uint64_t z13_inputs[] = {0x0000000000000000, 0x8000000000000000}; int p1_inputs[] = {1, 1}; InsrHelper(&masm, z12.VnD(), z12_inputs); InsrHelper(&masm, z13.VnD(), z13_inputs); Initialise(&masm, p1.VnD(), p1_inputs); __ Cmphi(p7.VnD(), p1.Zeroing(), z12.VnD(), z13.VnD()); __ Mrs(x7, NZCV); int z14_inputs[] = {0, 32767, -1, -32767, 0, 0, 0, 32766}; int z15_inputs[] = {0, 0, 0, 0, 32767, -1, -32767, 32767}; int p2_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1}; InsrHelper(&masm, z14.VnH(), z14_inputs); InsrHelper(&masm, z15.VnH(), z15_inputs); Initialise(&masm, p2.VnH(), p2_inputs); __ Cmpge(p8.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH()); __ Mrs(x8, NZCV); __ Cmpeq(p9.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH()); __ Mrs(x9, NZCV); int z16_inputs[] = {0, -1, 0, 0}; int z17_inputs[] = {0, 0, 2147483647, -2147483648}; int p3_inputs[] = {1, 1, 1, 1}; InsrHelper(&masm, 
             z16.VnS(), z16_inputs);
  InsrHelper(&masm, z17.VnS(), z17_inputs);
  Initialise(&masm, p3.VnS(), p3_inputs);

  __ Cmpgt(p10.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
  __ Mrs(x10, NZCV);
  __ Cmpne(p11.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
  __ Mrs(x11, NZCV);

  // Architectural aliases testing.
  __ Cmpls(p12.VnB(), p0.Zeroing(), z11.VnB(), z10.VnB());  // HS
  __ Cmplo(p13.VnD(), p1.Zeroing(), z13.VnD(), z12.VnD());  // HI
  __ Cmple(p14.VnH(), p2.Zeroing(), z15.VnH(), z14.VnH());  // GE
  __ Cmplt(p15.VnS(), p3.Zeroing(), z17.VnS(), z16.VnS());  // GT

  END();

  if (CAN_RUN()) {
    RUN();

    int p6_expected[] = {1, 0, 1, 1, 0, 0, 0, 1};
    for (size_t i = 0; i < ArrayLength(p6_expected); i++) {
      int lane = static_cast<int>(ArrayLength(p6_expected) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p6_expected[i], p6.VnB(), lane);
    }

    int p7_expected[] = {1, 0};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnD());

    int p8_expected[] = {1, 0, 0, 0, 0, 1, 1, 0};
    ASSERT_EQUAL_SVE(p8_expected, p8.VnH());

    int p9_expected[] = {1, 0, 0, 0, 0, 0, 0, 0};
    ASSERT_EQUAL_SVE(p9_expected, p9.VnH());

    int p10_expected[] = {0, 0, 0, 1};
    ASSERT_EQUAL_SVE(p10_expected, p10.VnS());

    int p11_expected[] = {0, 1, 1, 1};
    ASSERT_EQUAL_SVE(p11_expected, p11.VnS());

    // Reuse the expected results to verify the architectural aliases.
    ASSERT_EQUAL_SVE(p6_expected, p12.VnB());
    ASSERT_EQUAL_SVE(p7_expected, p13.VnD());
    ASSERT_EQUAL_SVE(p8_expected, p14.VnH());
    ASSERT_EQUAL_SVE(p10_expected, p15.VnS());

    ASSERT_EQUAL_32(SVEFirstFlag, w6);
    ASSERT_EQUAL_32(NoFlag, w7);
    ASSERT_EQUAL_32(NoFlag, w8);
    ASSERT_EQUAL_32(NoFlag, w9);
    ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
  }
}

TEST_SVE(sve_int_compare_vectors_wide_elements) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int src1_inputs_1[] = {0, 1, -1, -128, 127, 100, -66};
  int src2_inputs_1[] = {0, -1};
  int mask_inputs_1[] = {1, 1, 1, 1, 1, 0, 1};
  InsrHelper(&masm, z13.VnB(), src1_inputs_1);
  InsrHelper(&masm, z19.VnD(), src2_inputs_1);
  Initialise(&masm, p0.VnB(), mask_inputs_1);

  __ Cmpge(p2.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x2, NZCV);
  __ Cmpgt(p3.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x3, NZCV);

  int src1_inputs_2[] = {0, 32767, -1, -32767, 1, 1234, 0, 32766};
  int src2_inputs_2[] = {0, -32767};
  int mask_inputs_2[] = {1, 0, 1, 1, 1, 1, 1, 1};
  InsrHelper(&masm, z13.VnH(), src1_inputs_2);
  InsrHelper(&masm, z19.VnD(), src2_inputs_2);
  Initialise(&masm, p0.VnH(), mask_inputs_2);

  __ Cmple(p4.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
  __ Mrs(x4, NZCV);
  __ Cmplt(p5.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
  __ Mrs(x5, NZCV);

  int src1_inputs_3[] = {0, -1, 2147483647, -2147483648};
  int src2_inputs_3[] = {0, -2147483648};
  int mask_inputs_3[] = {1, 1, 1, 1};
  InsrHelper(&masm, z13.VnS(), src1_inputs_3);
  InsrHelper(&masm, z19.VnD(), src2_inputs_3);
  Initialise(&masm, p0.VnS(), mask_inputs_3);

  __ Cmpeq(p6.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
  __ Mrs(x6, NZCV);
  __ Cmpne(p7.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
  __ Mrs(x7, NZCV);

  int src1_inputs_4[] = {0x00, 0x80, 0x7f, 0xff, 0x7f, 0xf0, 0x0f, 0x55};
  int src2_inputs_4[] = {0x00, 0x7f};
  int mask_inputs_4[] = {1, 1, 1, 1, 0, 1, 1, 1};
  InsrHelper(&masm, z13.VnB(), src1_inputs_4);
  InsrHelper(&masm, z19.VnD(), src2_inputs_4);
  Initialise(&masm, p0.VnB(), mask_inputs_4);

  __ Cmplo(p8.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x8, NZCV);
  __ Cmpls(p9.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x9, NZCV);

  int src1_inputs_5[] = {0x0000, 0x8000, 0x7fff, 0xffff};
  int src2_inputs_5[] = {0x8000, 0xffff};
  int mask_inputs_5[] = {1, 1, 1, 1};
  InsrHelper(&masm,
z13.VnS(), src1_inputs_5); InsrHelper(&masm, z19.VnD(), src2_inputs_5); Initialise(&masm, p0.VnS(), mask_inputs_5); __ Cmphi(p10.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD()); __ Mrs(x10, NZCV); __ Cmphs(p11.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD()); __ Mrs(x11, NZCV); END(); if (CAN_RUN()) { RUN(); int p2_expected[] = {1, 1, 1, 0, 1, 0, 0}; ASSERT_EQUAL_SVE(p2_expected, p2.VnB()); int p3_expected[] = {1, 1, 0, 0, 1, 0, 0}; ASSERT_EQUAL_SVE(p3_expected, p3.VnB()); int p4_expected[] = {0x1, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0}; ASSERT_EQUAL_SVE(p4_expected, p4.VnH()); int p5_expected[] = {0x0, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0}; ASSERT_EQUAL_SVE(p5_expected, p5.VnH()); int p6_expected[] = {0x1, 0x0, 0x0, 0x1}; ASSERT_EQUAL_SVE(p6_expected, p6.VnS()); int p7_expected[] = {0x0, 0x1, 0x1, 0x0}; ASSERT_EQUAL_SVE(p7_expected, p7.VnS()); int p8_expected[] = {1, 0, 0, 0, 0, 0, 1, 1}; ASSERT_EQUAL_SVE(p8_expected, p8.VnB()); int p9_expected[] = {1, 0, 1, 0, 0, 0, 1, 1}; ASSERT_EQUAL_SVE(p9_expected, p9.VnB()); int p10_expected[] = {0x0, 0x0, 0x0, 0x0}; ASSERT_EQUAL_SVE(p10_expected, p10.VnS()); int p11_expected[] = {0x0, 0x1, 0x0, 0x1}; ASSERT_EQUAL_SVE(p11_expected, p11.VnS()); ASSERT_EQUAL_32(NoFlag, w2); ASSERT_EQUAL_32(NoFlag, w3); ASSERT_EQUAL_32(NoFlag, w4); ASSERT_EQUAL_32(SVENotLastFlag, w5); ASSERT_EQUAL_32(SVEFirstFlag, w6); ASSERT_EQUAL_32(SVENotLastFlag, w7); ASSERT_EQUAL_32(SVEFirstFlag, w8); ASSERT_EQUAL_32(SVEFirstFlag, w9); ASSERT_EQUAL_32(SVENotLastFlag | SVENoneFlag, w10); ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w11); } } TEST_SVE(sve_bitwise_imm) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // clang-format off uint64_t z21_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef}; uint32_t z22_inputs[] = {0xfedcba98, 0x76543210, 0x01234567, 0x89abcdef}; uint16_t z23_inputs[] = {0xfedc, 0xba98, 0x7654, 0x3210, 0x0123, 0x4567, 0x89ab, 0xcdef}; uint8_t z24_inputs[] = {0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef}; // clang-format on InsrHelper(&masm, z1.VnD(), z21_inputs); InsrHelper(&masm, z2.VnS(), z22_inputs); InsrHelper(&masm, z3.VnH(), z23_inputs); InsrHelper(&masm, z4.VnB(), z24_inputs); __ And(z1.VnD(), z1.VnD(), 0x0000ffff0000ffff); __ And(z2.VnS(), z2.VnS(), 0xff0000ff); __ And(z3.VnH(), z3.VnH(), 0x0ff0); __ And(z4.VnB(), z4.VnB(), 0x3f); InsrHelper(&masm, z5.VnD(), z21_inputs); InsrHelper(&masm, z6.VnS(), z22_inputs); InsrHelper(&masm, z7.VnH(), z23_inputs); InsrHelper(&masm, z8.VnB(), z24_inputs); __ Eor(z5.VnD(), z5.VnD(), 0x0000ffff0000ffff); __ Eor(z6.VnS(), z6.VnS(), 0xff0000ff); __ Eor(z7.VnH(), z7.VnH(), 0x0ff0); __ Eor(z8.VnB(), z8.VnB(), 0x3f); InsrHelper(&masm, z9.VnD(), z21_inputs); InsrHelper(&masm, z10.VnS(), z22_inputs); InsrHelper(&masm, z11.VnH(), z23_inputs); InsrHelper(&masm, z12.VnB(), z24_inputs); __ Orr(z9.VnD(), z9.VnD(), 0x0000ffff0000ffff); __ Orr(z10.VnS(), z10.VnS(), 0xff0000ff); __ Orr(z11.VnH(), z11.VnH(), 0x0ff0); __ Orr(z12.VnB(), z12.VnB(), 0x3f); { // The `Dup` macro maps onto either `dup` or `dupm`, but has its own test, // so here we test `dupm` directly. 
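    // `dupm` takes a bitmask immediate (the same encoding family as the
    // logical immediates), which is why repeating patterns such as
    // 0x7ffffff800000000 or 0x3ffc are encodable while arbitrary constants
    // are not.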
ExactAssemblyScope guard(&masm, 4 * kInstructionSize); __ dupm(z13.VnD(), 0x7ffffff800000000); __ dupm(z14.VnS(), 0x7ffc7ffc); __ dupm(z15.VnH(), 0x3ffc); __ dupm(z16.VnB(), 0xc3); } END(); if (CAN_RUN()) { RUN(); // clang-format off uint64_t z1_expected[] = {0x0000ba9800003210, 0x000045670000cdef}; uint32_t z2_expected[] = {0xfe000098, 0x76000010, 0x01000067, 0x890000ef}; uint16_t z3_expected[] = {0x0ed0, 0x0a90, 0x0650, 0x0210, 0x0120, 0x0560, 0x09a0, 0x0de0}; uint8_t z4_expected[] = {0x3e, 0x1c, 0x3a, 0x18, 0x36, 0x14, 0x32, 0x10, 0x01, 0x23, 0x05, 0x27, 0x09, 0x2b, 0x0d, 0x2f}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); ASSERT_EQUAL_SVE(z2_expected, z2.VnS()); ASSERT_EQUAL_SVE(z3_expected, z3.VnH()); ASSERT_EQUAL_SVE(z4_expected, z4.VnB()); uint64_t z5_expected[] = {0xfedc45677654cdef, 0x0123ba9889ab3210}; uint32_t z6_expected[] = {0x01dcba67, 0x895432ef, 0xfe234598, 0x76abcd10}; uint16_t z7_expected[] = {0xf12c, 0xb568, 0x79a4, 0x3de0, 0x0ed3, 0x4a97, 0x865b, 0xc21f}; uint8_t z8_expected[] = {0xc1, 0xe3, 0x85, 0xa7, 0x49, 0x6b, 0x0d, 0x2f, 0x3e, 0x1c, 0x7a, 0x58, 0xb6, 0x94, 0xf2, 0xd0}; ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); ASSERT_EQUAL_SVE(z6_expected, z6.VnS()); ASSERT_EQUAL_SVE(z7_expected, z7.VnH()); ASSERT_EQUAL_SVE(z8_expected, z8.VnB()); uint64_t z9_expected[] = {0xfedcffff7654ffff, 0x0123ffff89abffff}; uint32_t z10_expected[] = {0xffdcbaff, 0xff5432ff, 0xff2345ff, 0xffabcdff}; uint16_t z11_expected[] = {0xfffc, 0xbff8, 0x7ff4, 0x3ff0, 0x0ff3, 0x4ff7, 0x8ffb, 0xcfff}; uint8_t z12_expected[] = {0xff, 0xff, 0xbf, 0xbf, 0x7f, 0x7f, 0x3f, 0x3f, 0x3f, 0x3f, 0x7f, 0x7f, 0xbf, 0xbf, 0xff, 0xff}; ASSERT_EQUAL_SVE(z9_expected, z9.VnD()); ASSERT_EQUAL_SVE(z10_expected, z10.VnS()); ASSERT_EQUAL_SVE(z11_expected, z11.VnH()); ASSERT_EQUAL_SVE(z12_expected, z12.VnB()); uint64_t z13_expected[] = {0x7ffffff800000000, 0x7ffffff800000000}; uint32_t z14_expected[] = {0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc}; uint16_t z15_expected[] = {0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc ,0x3ffc}; ASSERT_EQUAL_SVE(z13_expected, z13.VnD()); ASSERT_EQUAL_SVE(z14_expected, z14.VnS()); ASSERT_EQUAL_SVE(z15_expected, z15.VnH()); // clang-format on } } TEST_SVE(sve_dup_imm) { // The `Dup` macro can generate `dup`, `dupm`, and it can synthesise // unencodable immediates. SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Encodable with `dup` (shift 0). __ Dup(z0.VnD(), -1); __ Dup(z1.VnS(), 0x7f); __ Dup(z2.VnH(), -0x80); __ Dup(z3.VnB(), 42); // Encodable with `dup` (shift 8). __ Dup(z4.VnD(), -42 * 256); __ Dup(z5.VnS(), -0x8000); __ Dup(z6.VnH(), 0x7f00); // B-sized lanes cannot take a shift of 8. // Encodable with `dupm` (but not `dup`). __ Dup(z10.VnD(), 0x3fc); __ Dup(z11.VnS(), -516097); // 0xfff81fff, as a signed int. __ Dup(z12.VnH(), 0x0001); // All values that fit B-sized lanes are encodable with `dup`. // Cases that require immediate synthesis. __ Dup(z20.VnD(), 0x1234); __ Dup(z21.VnD(), -4242); __ Dup(z22.VnD(), 0xfedcba9876543210); __ Dup(z23.VnS(), 0x01020304); __ Dup(z24.VnS(), -0x01020304); __ Dup(z25.VnH(), 0x3c38); // All values that fit B-sized lanes are directly encodable. 
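  // For example, -4242 is 0xffffffffffffef6e as a 64-bit value; it is neither
  // a valid `dup` immediate (an 8-bit signed value, optionally shifted by 8)
  // nor a valid `dupm` bitmask pattern, so the macro has to synthesise it
  // (typically via a scratch register).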
  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(0xffffffffffffffff, z0.VnD());
    ASSERT_EQUAL_SVE(0x0000007f, z1.VnS());
    ASSERT_EQUAL_SVE(0xff80, z2.VnH());
    ASSERT_EQUAL_SVE(0x2a, z3.VnB());

    ASSERT_EQUAL_SVE(0xffffffffffffd600, z4.VnD());
    ASSERT_EQUAL_SVE(0xffff8000, z5.VnS());
    ASSERT_EQUAL_SVE(0x7f00, z6.VnH());

    ASSERT_EQUAL_SVE(0x00000000000003fc, z10.VnD());
    ASSERT_EQUAL_SVE(0xfff81fff, z11.VnS());
    ASSERT_EQUAL_SVE(0x0001, z12.VnH());

    ASSERT_EQUAL_SVE(0x1234, z20.VnD());
    ASSERT_EQUAL_SVE(0xffffffffffffef6e, z21.VnD());
    ASSERT_EQUAL_SVE(0xfedcba9876543210, z22.VnD());
    ASSERT_EQUAL_SVE(0x01020304, z23.VnS());
    ASSERT_EQUAL_SVE(0xfefdfcfc, z24.VnS());
    ASSERT_EQUAL_SVE(0x3c38, z25.VnH());
  }
}

TEST_SVE(sve_inc_dec_p_scalar) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
  Initialise(&masm, p0.VnB(), p0_inputs);

  int p0_b_count = 9;
  int p0_h_count = 5;
  int p0_s_count = 3;
  int p0_d_count = 2;

  // 64-bit operations preserve their high bits.
  __ Mov(x0, 0x123456780000002a);
  __ Decp(x0, p0.VnB());

  __ Mov(x1, 0x123456780000002a);
  __ Incp(x1, p0.VnH());

  // Check that saturation does not occur.
  __ Mov(x10, 1);
  __ Decp(x10, p0.VnS());

  __ Mov(x11, UINT64_MAX);
  __ Incp(x11, p0.VnD());

  __ Mov(x12, INT64_MAX);
  __ Incp(x12, p0.VnB());

  // With an all-true predicate, these instructions increment or decrement by
  // the vector length.
  __ Ptrue(p15.VnB());

  __ Mov(x20, 0x4000000000000000);
  __ Decp(x20, p15.VnB());

  __ Mov(x21, 0x4000000000000000);
  __ Incp(x21, p15.VnH());

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_64(0x123456780000002a - p0_b_count, x0);
    ASSERT_EQUAL_64(0x123456780000002a + p0_h_count, x1);

    ASSERT_EQUAL_64(UINT64_C(1) - p0_s_count, x10);
    ASSERT_EQUAL_64(UINT64_MAX + p0_d_count, x11);
    ASSERT_EQUAL_64(static_cast<uint64_t>(INT64_MAX) + p0_b_count, x12);

    ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
    ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
  }
}

TEST_SVE(sve_sqinc_sqdec_p_scalar) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
  Initialise(&masm, p0.VnB(), p0_inputs);

  int p0_b_count = 9;
  int p0_h_count = 5;
  int p0_s_count = 3;
  int p0_d_count = 2;

  uint64_t placeholder_high = 0x1234567800000000;

  // 64-bit operations preserve their high bits.
  __ Mov(x0, placeholder_high + 42);
  __ Sqdecp(x0, p0.VnB());

  __ Mov(x1, placeholder_high + 42);
  __ Sqincp(x1, p0.VnH());

  // 32-bit operations sign-extend into their high bits.
  __ Mov(x2, placeholder_high + 42);
  __ Sqdecp(x2, p0.VnS(), w2);

  __ Mov(x3, placeholder_high + 42);
  __ Sqincp(x3, p0.VnD(), w3);

  __ Mov(x4, placeholder_high + 1);
  __ Sqdecp(x4, p0.VnS(), w4);

  __ Mov(x5, placeholder_high - 1);
  __ Sqincp(x5, p0.VnD(), w5);

  // Check that saturation behaves correctly.
  __ Mov(x10, 0x8000000000000001);  // INT64_MIN + 1
  __ Sqdecp(x10, p0.VnB());

  __ Mov(x11, placeholder_high + 0x80000001);  // INT32_MIN + 1
  __ Sqdecp(x11, p0.VnH(), w11);

  __ Mov(x12, 1);
  __ Sqdecp(x12, p0.VnS());

  __ Mov(x13, placeholder_high + 1);
  __ Sqdecp(x13, p0.VnD(), w13);

  __ Mov(x14, 0x7ffffffffffffffe);  // INT64_MAX - 1
  __ Sqincp(x14, p0.VnB());

  __ Mov(x15, placeholder_high + 0x7ffffffe);  // INT32_MAX - 1
  __ Sqincp(x15, p0.VnH(), w15);

  // Don't use x16 and x17 since they are scratch registers by default.
__ Mov(x18, 0xffffffffffffffff); __ Sqincp(x18, p0.VnS()); __ Mov(x19, placeholder_high + 0xffffffff); __ Sqincp(x19, p0.VnD(), w19); __ Mov(x20, placeholder_high + 0xffffffff); __ Sqdecp(x20, p0.VnB(), w20); // With an all-true predicate, these instructions increment or decrement by // the vector length. __ Ptrue(p15.VnB()); __ Mov(x21, 0); __ Sqdecp(x21, p15.VnB()); __ Mov(x22, 0); __ Sqincp(x22, p15.VnH()); __ Mov(x23, placeholder_high); __ Sqdecp(x23, p15.VnS(), w23); __ Mov(x24, placeholder_high); __ Sqincp(x24, p15.VnD(), w24); END(); if (CAN_RUN()) { RUN(); // 64-bit operations preserve their high bits. ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0); ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1); // 32-bit operations sign-extend into their high bits. ASSERT_EQUAL_64(42 - p0_s_count, x2); ASSERT_EQUAL_64(42 + p0_d_count, x3); ASSERT_EQUAL_64(0xffffffff00000000 | (1 - p0_s_count), x4); ASSERT_EQUAL_64(p0_d_count - 1, x5); // Check that saturation behaves correctly. ASSERT_EQUAL_64(INT64_MIN, x10); ASSERT_EQUAL_64(INT32_MIN, x11); ASSERT_EQUAL_64(1 - p0_s_count, x12); ASSERT_EQUAL_64(1 - p0_d_count, x13); ASSERT_EQUAL_64(INT64_MAX, x14); ASSERT_EQUAL_64(INT32_MAX, x15); ASSERT_EQUAL_64(p0_s_count - 1, x18); ASSERT_EQUAL_64(p0_d_count - 1, x19); ASSERT_EQUAL_64(-1 - p0_b_count, x20); // Check all-true predicates. ASSERT_EQUAL_64(-core.GetSVELaneCount(kBRegSize), x21); ASSERT_EQUAL_64(core.GetSVELaneCount(kHRegSize), x22); ASSERT_EQUAL_64(-core.GetSVELaneCount(kSRegSize), x23); ASSERT_EQUAL_64(core.GetSVELaneCount(kDRegSize), x24); } } TEST_SVE(sve_uqinc_uqdec_p_scalar) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1}; Initialise(&masm, p0.VnB(), p0_inputs); int p0_b_count = 9; int p0_h_count = 5; int p0_s_count = 3; int p0_d_count = 2; uint64_t placeholder_high = 0x1234567800000000; // 64-bit operations preserve their high bits. __ Mov(x0, placeholder_high + 42); __ Uqdecp(x0, p0.VnB()); __ Mov(x1, placeholder_high + 42); __ Uqincp(x1, p0.VnH()); // 32-bit operations zero-extend into their high bits. __ Mov(x2, placeholder_high + 42); __ Uqdecp(x2, p0.VnS(), w2); __ Mov(x3, placeholder_high + 42); __ Uqincp(x3, p0.VnD(), w3); __ Mov(x4, placeholder_high + 0x80000001); __ Uqdecp(x4, p0.VnS(), w4); __ Mov(x5, placeholder_high + 0x7fffffff); __ Uqincp(x5, p0.VnD(), w5); // Check that saturation behaves correctly. __ Mov(x10, 1); __ Uqdecp(x10, p0.VnB(), x10); __ Mov(x11, placeholder_high + 1); __ Uqdecp(x11, p0.VnH(), w11); __ Mov(x12, 0x8000000000000000); // INT64_MAX + 1 __ Uqdecp(x12, p0.VnS(), x12); __ Mov(x13, placeholder_high + 0x80000000); // INT32_MAX + 1 __ Uqdecp(x13, p0.VnD(), w13); __ Mov(x14, 0xfffffffffffffffe); // UINT64_MAX - 1 __ Uqincp(x14, p0.VnB(), x14); __ Mov(x15, placeholder_high + 0xfffffffe); // UINT32_MAX - 1 __ Uqincp(x15, p0.VnH(), w15); // Don't use x16 and x17 since they are scratch registers by default. __ Mov(x18, 0x7ffffffffffffffe); // INT64_MAX - 1 __ Uqincp(x18, p0.VnS(), x18); __ Mov(x19, placeholder_high + 0x7ffffffe); // INT32_MAX - 1 __ Uqincp(x19, p0.VnD(), w19); // With an all-true predicate, these instructions increment or decrement by // the vector length. 
__ Ptrue(p15.VnB()); __ Mov(x20, 0x4000000000000000); __ Uqdecp(x20, p15.VnB(), x20); __ Mov(x21, 0x4000000000000000); __ Uqincp(x21, p15.VnH(), x21); __ Mov(x22, placeholder_high + 0x40000000); __ Uqdecp(x22, p15.VnS(), w22); __ Mov(x23, placeholder_high + 0x40000000); __ Uqincp(x23, p15.VnD(), w23); END(); if (CAN_RUN()) { RUN(); // 64-bit operations preserve their high bits. ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0); ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1); // 32-bit operations zero-extend into their high bits. ASSERT_EQUAL_64(42 - p0_s_count, x2); ASSERT_EQUAL_64(42 + p0_d_count, x3); ASSERT_EQUAL_64(UINT64_C(0x80000001) - p0_s_count, x4); ASSERT_EQUAL_64(UINT64_C(0x7fffffff) + p0_d_count, x5); // Check that saturation behaves correctly. ASSERT_EQUAL_64(0, x10); ASSERT_EQUAL_64(0, x11); ASSERT_EQUAL_64(0x8000000000000000 - p0_s_count, x12); ASSERT_EQUAL_64(UINT64_C(0x80000000) - p0_d_count, x13); ASSERT_EQUAL_64(UINT64_MAX, x14); ASSERT_EQUAL_64(UINT32_MAX, x15); ASSERT_EQUAL_64(0x7ffffffffffffffe + p0_s_count, x18); ASSERT_EQUAL_64(UINT64_C(0x7ffffffe) + p0_d_count, x19); // Check all-true predicates. ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20); ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21); ASSERT_EQUAL_64(0x40000000 - core.GetSVELaneCount(kSRegSize), x22); ASSERT_EQUAL_64(0x40000000 + core.GetSVELaneCount(kDRegSize), x23); } } TEST_SVE(sve_inc_dec_p_vector) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored. int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1}; Initialise(&masm, p0.VnB(), p0_inputs); // Check that saturation does not occur. int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN}; InsrHelper(&masm, z0.VnD(), z0_inputs); int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX}; InsrHelper(&masm, z1.VnD(), z1_inputs); int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN}; InsrHelper(&masm, z2.VnS(), z2_inputs); int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX}; InsrHelper(&masm, z3.VnH(), z3_inputs); // The MacroAssembler implements non-destructive operations using movprfx. __ Decp(z10.VnD(), p0, z0.VnD()); __ Decp(z11.VnD(), p0, z1.VnD()); __ Decp(z12.VnS(), p0, z2.VnS()); __ Decp(z13.VnH(), p0, z3.VnH()); __ Incp(z14.VnD(), p0, z0.VnD()); __ Incp(z15.VnD(), p0, z1.VnD()); __ Incp(z16.VnS(), p0, z2.VnS()); __ Incp(z17.VnH(), p0, z3.VnH()); // Also test destructive forms. __ Mov(z4, z0); __ Mov(z5, z1); __ Mov(z6, z2); __ Mov(z7, z3); __ Decp(z0.VnD(), p0); __ Decp(z1.VnD(), p0); __ Decp(z2.VnS(), p0); __ Decp(z3.VnH(), p0); __ Incp(z4.VnD(), p0); __ Incp(z5.VnD(), p0); __ Incp(z6.VnS(), p0); __ Incp(z7.VnH(), p0); END(); if (CAN_RUN()) { RUN(); // z0_inputs[...] - number of active D lanes (2) int64_t z0_expected[] = {0x1234567800000040, -2, -1, 0x7ffffffffffffffe}; ASSERT_EQUAL_SVE(z0_expected, z0.VnD()); // z1_inputs[...] - number of active D lanes (2) int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); // z2_inputs[...] - number of active S lanes (3) int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, 0x7ffffffd}; ASSERT_EQUAL_SVE(z2_expected, z2.VnS()); // z3_inputs[...] - number of active H lanes (5) int16_t z3_expected[] = {0x1225, -5, -4, -6, 0x7ffb, 0x7ffa}; ASSERT_EQUAL_SVE(z3_expected, z3.VnH()); // z0_inputs[...] 
+ number of active D lanes (2) uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002}; ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); // z1_inputs[...] + number of active D lanes (2) uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, 0x8000000000000001}; ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); // z2_inputs[...] + number of active S lanes (3) uint32_t z6_expected[] = {0x12340045, 3, 2, 4, 0x80000002, 0x80000003}; ASSERT_EQUAL_SVE(z6_expected, z6.VnS()); // z3_inputs[...] + number of active H lanes (5) uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, 0x8004}; ASSERT_EQUAL_SVE(z7_expected, z7.VnH()); // Check that the non-destructive macros produced the same results. ASSERT_EQUAL_SVE(z0_expected, z10.VnD()); ASSERT_EQUAL_SVE(z1_expected, z11.VnD()); ASSERT_EQUAL_SVE(z2_expected, z12.VnS()); ASSERT_EQUAL_SVE(z3_expected, z13.VnH()); ASSERT_EQUAL_SVE(z4_expected, z14.VnD()); ASSERT_EQUAL_SVE(z5_expected, z15.VnD()); ASSERT_EQUAL_SVE(z6_expected, z16.VnS()); ASSERT_EQUAL_SVE(z7_expected, z17.VnH()); } } TEST_SVE(sve_inc_dec_ptrue_vector) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // With an all-true predicate, these instructions increment or decrement by // the vector length. __ Ptrue(p15.VnB()); __ Dup(z0.VnD(), 0); __ Decp(z0.VnD(), p15); __ Dup(z1.VnS(), 0); __ Decp(z1.VnS(), p15); __ Dup(z2.VnH(), 0); __ Decp(z2.VnH(), p15); __ Dup(z3.VnD(), 0); __ Incp(z3.VnD(), p15); __ Dup(z4.VnS(), 0); __ Incp(z4.VnS(), p15); __ Dup(z5.VnH(), 0); __ Incp(z5.VnH(), p15); END(); if (CAN_RUN()) { RUN(); int d_lane_count = core.GetSVELaneCount(kDRegSize); int s_lane_count = core.GetSVELaneCount(kSRegSize); int h_lane_count = core.GetSVELaneCount(kHRegSize); for (int i = 0; i < d_lane_count; i++) { ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i); ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i); } for (int i = 0; i < s_lane_count; i++) { ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i); ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i); } for (int i = 0; i < h_lane_count; i++) { ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i); ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i); } } } TEST_SVE(sve_sqinc_sqdec_p_vector) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored. int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1}; Initialise(&masm, p0.VnB(), p0_inputs); // Check that saturation behaves correctly. int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN}; InsrHelper(&masm, z0.VnD(), z0_inputs); int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX}; InsrHelper(&masm, z1.VnD(), z1_inputs); int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN}; InsrHelper(&masm, z2.VnS(), z2_inputs); int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX}; InsrHelper(&masm, z3.VnH(), z3_inputs); // The MacroAssembler implements non-destructive operations using movprfx. __ Sqdecp(z10.VnD(), p0, z0.VnD()); __ Sqdecp(z11.VnD(), p0, z1.VnD()); __ Sqdecp(z12.VnS(), p0, z2.VnS()); __ Sqdecp(z13.VnH(), p0, z3.VnH()); __ Sqincp(z14.VnD(), p0, z0.VnD()); __ Sqincp(z15.VnD(), p0, z1.VnD()); __ Sqincp(z16.VnS(), p0, z2.VnS()); __ Sqincp(z17.VnH(), p0, z3.VnH()); // Also test destructive forms. 
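  // Copy the inputs into z4-z7 first, so that the decrementing (z0-z3) and
  // incrementing (z4-z7) destructive forms can be checked against the same
  // expected values as the movprfx-based forms above.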
__ Mov(z4, z0); __ Mov(z5, z1); __ Mov(z6, z2); __ Mov(z7, z3); __ Sqdecp(z0.VnD(), p0); __ Sqdecp(z1.VnD(), p0); __ Sqdecp(z2.VnS(), p0); __ Sqdecp(z3.VnH(), p0); __ Sqincp(z4.VnD(), p0); __ Sqincp(z5.VnD(), p0); __ Sqincp(z6.VnS(), p0); __ Sqincp(z7.VnH(), p0); END(); if (CAN_RUN()) { RUN(); // z0_inputs[...] - number of active D lanes (2) int64_t z0_expected[] = {0x1234567800000040, -2, -1, INT64_MIN}; ASSERT_EQUAL_SVE(z0_expected, z0.VnD()); // z1_inputs[...] - number of active D lanes (2) int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); // z2_inputs[...] - number of active S lanes (3) int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, INT32_MIN}; ASSERT_EQUAL_SVE(z2_expected, z2.VnS()); // z3_inputs[...] - number of active H lanes (5) int16_t z3_expected[] = {0x1225, -5, -4, -6, INT16_MIN, 0x7ffa}; ASSERT_EQUAL_SVE(z3_expected, z3.VnH()); // z0_inputs[...] + number of active D lanes (2) uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002}; ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); // z1_inputs[...] + number of active D lanes (2) uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, INT64_MAX}; ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); // z2_inputs[...] + number of active S lanes (3) uint32_t z6_expected[] = {0x12340045, 3, 2, 4, INT32_MAX, 0x80000003}; ASSERT_EQUAL_SVE(z6_expected, z6.VnS()); // z3_inputs[...] + number of active H lanes (5) uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, INT16_MAX}; ASSERT_EQUAL_SVE(z7_expected, z7.VnH()); // Check that the non-destructive macros produced the same results. ASSERT_EQUAL_SVE(z0_expected, z10.VnD()); ASSERT_EQUAL_SVE(z1_expected, z11.VnD()); ASSERT_EQUAL_SVE(z2_expected, z12.VnS()); ASSERT_EQUAL_SVE(z3_expected, z13.VnH()); ASSERT_EQUAL_SVE(z4_expected, z14.VnD()); ASSERT_EQUAL_SVE(z5_expected, z15.VnD()); ASSERT_EQUAL_SVE(z6_expected, z16.VnS()); ASSERT_EQUAL_SVE(z7_expected, z17.VnH()); } } TEST_SVE(sve_sqinc_sqdec_ptrue_vector) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // With an all-true predicate, these instructions increment or decrement by // the vector length. __ Ptrue(p15.VnB()); __ Dup(z0.VnD(), 0); __ Sqdecp(z0.VnD(), p15); __ Dup(z1.VnS(), 0); __ Sqdecp(z1.VnS(), p15); __ Dup(z2.VnH(), 0); __ Sqdecp(z2.VnH(), p15); __ Dup(z3.VnD(), 0); __ Sqincp(z3.VnD(), p15); __ Dup(z4.VnS(), 0); __ Sqincp(z4.VnS(), p15); __ Dup(z5.VnH(), 0); __ Sqincp(z5.VnH(), p15); END(); if (CAN_RUN()) { RUN(); int d_lane_count = core.GetSVELaneCount(kDRegSize); int s_lane_count = core.GetSVELaneCount(kSRegSize); int h_lane_count = core.GetSVELaneCount(kHRegSize); for (int i = 0; i < d_lane_count; i++) { ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i); ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i); } for (int i = 0; i < s_lane_count; i++) { ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i); ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i); } for (int i = 0; i < h_lane_count; i++) { ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i); ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i); } } } TEST_SVE(sve_uqinc_uqdec_p_vector) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored. int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1}; Initialise(&masm, p0.VnB(), p0_inputs); // Check that saturation behaves correctly. 
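  // The unsigned saturating forms clamp at zero on decrement and at the lane's
  // maximum unsigned value on increment, so the inputs below include zero,
  // small values and the unsigned maxima for each lane size.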
uint64_t z0_inputs[] = {0x1234567800000042, 0, 1, 0x8000000000000000}; InsrHelper(&masm, z0.VnD(), z0_inputs); uint64_t z1_inputs[] = {0x12345678ffffff2a, 0, UINT64_MAX, INT64_MAX}; InsrHelper(&masm, z1.VnD(), z1_inputs); uint32_t z2_inputs[] = {0x12340042, 0, UINT32_MAX, 1, INT32_MAX, 0x80000000}; InsrHelper(&masm, z2.VnS(), z2_inputs); uint16_t z3_inputs[] = {0x122a, 0, 1, UINT16_MAX, 0x8000, INT16_MAX}; InsrHelper(&masm, z3.VnH(), z3_inputs); // The MacroAssembler implements non-destructive operations using movprfx. __ Uqdecp(z10.VnD(), p0, z0.VnD()); __ Uqdecp(z11.VnD(), p0, z1.VnD()); __ Uqdecp(z12.VnS(), p0, z2.VnS()); __ Uqdecp(z13.VnH(), p0, z3.VnH()); __ Uqincp(z14.VnD(), p0, z0.VnD()); __ Uqincp(z15.VnD(), p0, z1.VnD()); __ Uqincp(z16.VnS(), p0, z2.VnS()); __ Uqincp(z17.VnH(), p0, z3.VnH()); // Also test destructive forms. __ Mov(z4, z0); __ Mov(z5, z1); __ Mov(z6, z2); __ Mov(z7, z3); __ Uqdecp(z0.VnD(), p0); __ Uqdecp(z1.VnD(), p0); __ Uqdecp(z2.VnS(), p0); __ Uqdecp(z3.VnH(), p0); __ Uqincp(z4.VnD(), p0); __ Uqincp(z5.VnD(), p0); __ Uqincp(z6.VnS(), p0); __ Uqincp(z7.VnH(), p0); END(); if (CAN_RUN()) { RUN(); // z0_inputs[...] - number of active D lanes (2) uint64_t z0_expected[] = {0x1234567800000040, 0, 0, 0x7ffffffffffffffe}; ASSERT_EQUAL_SVE(z0_expected, z0.VnD()); // z1_inputs[...] - number of active D lanes (2) uint64_t z1_expected[] = {0x12345678ffffff28, 0, 0xfffffffffffffffd, 0x7ffffffffffffffd}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); // z2_inputs[...] - number of active S lanes (3) uint32_t z2_expected[] = {0x1234003f, 0, 0xfffffffc, 0, 0x7ffffffc, 0x7ffffffd}; ASSERT_EQUAL_SVE(z2_expected, z2.VnS()); // z3_inputs[...] - number of active H lanes (5) uint16_t z3_expected[] = {0x1225, 0, 0, 0xfffa, 0x7ffb, 0x7ffa}; ASSERT_EQUAL_SVE(z3_expected, z3.VnH()); // z0_inputs[...] + number of active D lanes (2) uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002}; ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); // z1_inputs[...] + number of active D lanes (2) uint64_t z5_expected[] = {0x12345678ffffff2c, 2, UINT64_MAX, 0x8000000000000001}; ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); // z2_inputs[...] + number of active S lanes (3) uint32_t z6_expected[] = {0x12340045, 3, UINT32_MAX, 4, 0x80000002, 0x80000003}; ASSERT_EQUAL_SVE(z6_expected, z6.VnS()); // z3_inputs[...] + number of active H lanes (5) uint16_t z7_expected[] = {0x122f, 5, 6, UINT16_MAX, 0x8005, 0x8004}; ASSERT_EQUAL_SVE(z7_expected, z7.VnH()); // Check that the non-destructive macros produced the same results. ASSERT_EQUAL_SVE(z0_expected, z10.VnD()); ASSERT_EQUAL_SVE(z1_expected, z11.VnD()); ASSERT_EQUAL_SVE(z2_expected, z12.VnS()); ASSERT_EQUAL_SVE(z3_expected, z13.VnH()); ASSERT_EQUAL_SVE(z4_expected, z14.VnD()); ASSERT_EQUAL_SVE(z5_expected, z15.VnD()); ASSERT_EQUAL_SVE(z6_expected, z16.VnS()); ASSERT_EQUAL_SVE(z7_expected, z17.VnH()); } } TEST_SVE(sve_uqinc_uqdec_ptrue_vector) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // With an all-true predicate, these instructions increment or decrement by // the vector length. 
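  // The per-lane start values used here leave ample headroom in both
  // directions, so the unsigned saturation never triggers and each result is
  // simply offset by the lane count.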
__ Ptrue(p15.VnB()); __ Mov(x0, 0x1234567800000000); __ Mov(x1, 0x12340000); __ Mov(x2, 0x1200); __ Dup(z0.VnD(), x0); __ Uqdecp(z0.VnD(), p15); __ Dup(z1.VnS(), x1); __ Uqdecp(z1.VnS(), p15); __ Dup(z2.VnH(), x2); __ Uqdecp(z2.VnH(), p15); __ Dup(z3.VnD(), x0); __ Uqincp(z3.VnD(), p15); __ Dup(z4.VnS(), x1); __ Uqincp(z4.VnS(), p15); __ Dup(z5.VnH(), x2); __ Uqincp(z5.VnH(), p15); END(); if (CAN_RUN()) { RUN(); int d_lane_count = core.GetSVELaneCount(kDRegSize); int s_lane_count = core.GetSVELaneCount(kSRegSize); int h_lane_count = core.GetSVELaneCount(kHRegSize); for (int i = 0; i < d_lane_count; i++) { ASSERT_EQUAL_SVE_LANE(0x1234567800000000 - d_lane_count, z0.VnD(), i); ASSERT_EQUAL_SVE_LANE(0x1234567800000000 + d_lane_count, z3.VnD(), i); } for (int i = 0; i < s_lane_count; i++) { ASSERT_EQUAL_SVE_LANE(0x12340000 - s_lane_count, z1.VnS(), i); ASSERT_EQUAL_SVE_LANE(0x12340000 + s_lane_count, z4.VnS(), i); } for (int i = 0; i < h_lane_count; i++) { ASSERT_EQUAL_SVE_LANE(0x1200 - h_lane_count, z2.VnH(), i); ASSERT_EQUAL_SVE_LANE(0x1200 + h_lane_count, z5.VnH(), i); } } } TEST_SVE(sve_index) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Simple cases. __ Index(z0.VnB(), 0, 1); __ Index(z1.VnH(), 1, 1); __ Index(z2.VnS(), 2, 1); __ Index(z3.VnD(), 3, 1); // Synthesised immediates. __ Index(z4.VnB(), 42, -1); __ Index(z5.VnH(), -1, 42); __ Index(z6.VnS(), 42, 42); // Register arguments. __ Mov(x0, 42); __ Mov(x1, -3); __ Index(z10.VnD(), x0, x1); __ Index(z11.VnB(), w0, w1); // The register size should correspond to the lane size, but VIXL allows any // register at least as big as the lane size. __ Index(z12.VnB(), x0, x1); __ Index(z13.VnH(), w0, x1); __ Index(z14.VnS(), x0, w1); // Integer overflow. __ Index(z20.VnB(), UINT8_MAX - 2, 2); __ Index(z21.VnH(), 7, -3); __ Index(z22.VnS(), INT32_MAX - 2, 1); __ Index(z23.VnD(), INT64_MIN + 6, -7); END(); if (CAN_RUN()) { RUN(); int b_lane_count = core.GetSVELaneCount(kBRegSize); int h_lane_count = core.GetSVELaneCount(kHRegSize); int s_lane_count = core.GetSVELaneCount(kSRegSize); int d_lane_count = core.GetSVELaneCount(kDRegSize); uint64_t b_mask = GetUintMask(kBRegSize); uint64_t h_mask = GetUintMask(kHRegSize); uint64_t s_mask = GetUintMask(kSRegSize); uint64_t d_mask = GetUintMask(kDRegSize); // Simple cases. for (int i = 0; i < b_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((0 + i) & b_mask, z0.VnB(), i); } for (int i = 0; i < h_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((1 + i) & h_mask, z1.VnH(), i); } for (int i = 0; i < s_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((2 + i) & s_mask, z2.VnS(), i); } for (int i = 0; i < d_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((3 + i) & d_mask, z3.VnD(), i); } // Synthesised immediates. for (int i = 0; i < b_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((42 - i) & b_mask, z4.VnB(), i); } for (int i = 0; i < h_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((-1 + (42 * i)) & h_mask, z5.VnH(), i); } for (int i = 0; i < s_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((42 + (42 * i)) & s_mask, z6.VnS(), i); } // Register arguments. 
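    // z10-z14 were all generated from Index(zd, 42, -3) (via x0 and x1), so
    // lane i is expected to hold 42 - (3 * i), truncated to the lane width.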
for (int i = 0; i < d_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & d_mask, z10.VnD(), i); } for (int i = 0; i < b_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z11.VnB(), i); } for (int i = 0; i < b_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z12.VnB(), i); } for (int i = 0; i < h_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & h_mask, z13.VnH(), i); } for (int i = 0; i < s_lane_count; i++) { ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & s_mask, z14.VnS(), i); } // Integer overflow. uint8_t expected_z20[] = {0x05, 0x03, 0x01, 0xff, 0xfd}; ASSERT_EQUAL_SVE(expected_z20, z20.VnB()); uint16_t expected_z21[] = {0xfffb, 0xfffe, 0x0001, 0x0004, 0x0007}; ASSERT_EQUAL_SVE(expected_z21, z21.VnH()); uint32_t expected_z22[] = {0x80000000, 0x7fffffff, 0x7ffffffe, 0x7ffffffd}; ASSERT_EQUAL_SVE(expected_z22, z22.VnS()); uint64_t expected_z23[] = {0x7fffffffffffffff, 0x8000000000000006}; ASSERT_EQUAL_SVE(expected_z23, z23.VnD()); } } TEST(sve_int_compare_count_and_limit_scalars) { SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Mov(w20, 0xfffffffd); __ Mov(w21, 0xffffffff); __ Whilele(p0.VnB(), w20, w21); __ Mrs(x0, NZCV); __ Whilele(p1.VnH(), w20, w21); __ Mrs(x1, NZCV); __ Mov(w20, 0xffffffff); __ Mov(w21, 0x00000000); __ Whilelt(p2.VnS(), w20, w21); __ Mrs(x2, NZCV); __ Whilelt(p3.VnD(), w20, w21); __ Mrs(x3, NZCV); __ Mov(w20, 0xfffffffd); __ Mov(w21, 0xffffffff); __ Whilels(p4.VnB(), w20, w21); __ Mrs(x4, NZCV); __ Whilels(p5.VnH(), w20, w21); __ Mrs(x5, NZCV); __ Mov(w20, 0xffffffff); __ Mov(w21, 0x00000000); __ Whilelo(p6.VnS(), w20, w21); __ Mrs(x6, NZCV); __ Whilelo(p7.VnD(), w20, w21); __ Mrs(x7, NZCV); __ Mov(x20, 0xfffffffffffffffd); __ Mov(x21, 0xffffffffffffffff); __ Whilele(p8.VnB(), x20, x21); __ Mrs(x8, NZCV); __ Whilele(p9.VnH(), x20, x21); __ Mrs(x9, NZCV); __ Mov(x20, 0xffffffffffffffff); __ Mov(x21, 0x0000000000000000); __ Whilelt(p10.VnS(), x20, x21); __ Mrs(x10, NZCV); __ Whilelt(p11.VnD(), x20, x21); __ Mrs(x11, NZCV); __ Mov(x20, 0xfffffffffffffffd); __ Mov(x21, 0xffffffffffffffff); __ Whilels(p12.VnB(), x20, x21); __ Mrs(x12, NZCV); __ Whilels(p13.VnH(), x20, x21); __ Mrs(x13, NZCV); __ Mov(x20, 0xffffffffffffffff); __ Mov(x21, 0x0000000000000000); __ Whilelo(p14.VnS(), x20, x21); __ Mrs(x14, NZCV); __ Whilelo(p15.VnD(), x20, x21); __ Mrs(x15, NZCV); END(); if (CAN_RUN()) { RUN(); // 0b...00000000'00000111 int p0_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1}; ASSERT_EQUAL_SVE(p0_expected, p0.VnB()); // 0b...00000000'00010101 int p1_expected[] = {0, 0, 0, 0, 0, 1, 1, 1}; ASSERT_EQUAL_SVE(p1_expected, p1.VnH()); int p2_expected[] = {0x0, 0x0, 0x0, 0x1}; ASSERT_EQUAL_SVE(p2_expected, p2.VnS()); int p3_expected[] = {0x00, 0x01}; ASSERT_EQUAL_SVE(p3_expected, p3.VnD()); // 0b...11111111'11111111 int p4_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; ASSERT_EQUAL_SVE(p4_expected, p4.VnB()); // 0b...01010101'01010101 int p5_expected[] = {1, 1, 1, 1, 1, 1, 1, 1}; ASSERT_EQUAL_SVE(p5_expected, p5.VnH()); int p6_expected[] = {0x0, 0x0, 0x0, 0x0}; ASSERT_EQUAL_SVE(p6_expected, p6.VnS()); int p7_expected[] = {0x00, 0x00}; ASSERT_EQUAL_SVE(p7_expected, p7.VnD()); // 0b...00000000'00000111 int p8_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1}; ASSERT_EQUAL_SVE(p8_expected, p8.VnB()); // 0b...00000000'00010101 int p9_expected[] = {0, 0, 0, 0, 0, 1, 1, 1}; ASSERT_EQUAL_SVE(p9_expected, p9.VnH()); int p10_expected[] = {0x0, 0x0, 0x0, 0x1}; ASSERT_EQUAL_SVE(p10_expected, p10.VnS()); int 
p11_expected[] = {0x00, 0x01}; ASSERT_EQUAL_SVE(p11_expected, p11.VnD()); // 0b...11111111'11111111 int p12_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; ASSERT_EQUAL_SVE(p12_expected, p12.VnB()); // 0b...01010101'01010101 int p13_expected[] = {1, 1, 1, 1, 1, 1, 1, 1}; ASSERT_EQUAL_SVE(p13_expected, p13.VnH()); int p14_expected[] = {0x0, 0x0, 0x0, 0x0}; ASSERT_EQUAL_SVE(p14_expected, p14.VnS()); int p15_expected[] = {0x00, 0x00}; ASSERT_EQUAL_SVE(p15_expected, p15.VnD()); ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w0); ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w1); ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w2); ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w3); ASSERT_EQUAL_32(SVEFirstFlag, w4); ASSERT_EQUAL_32(SVEFirstFlag, w5); ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w6); ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w7); ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w8); ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w9); ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10); ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w11); ASSERT_EQUAL_32(SVEFirstFlag, w12); ASSERT_EQUAL_32(SVEFirstFlag, w13); ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w14); ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w15); } } TEST(sve_int_compare_count_and_limit_scalars_regression_test) { SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Mov(w0, 0x7ffffffd); __ Mov(w1, 0x7fffffff); __ Whilele(p0.VnB(), w0, w1); END(); if (CAN_RUN()) { RUN(); int p0_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; ASSERT_EQUAL_SVE(p0_expected, p0.VnB()); } } TEST(sve_int_compare_vectors_signed_imm) { SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int z13_inputs[] = {0, 1, -1, -15, 126, -127, -126, -15}; int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 1, 1}; InsrHelper(&masm, z13.VnB(), z13_inputs); Initialise(&masm, p0.VnB(), mask_inputs1); __ Cmpeq(p2.VnB(), p0.Zeroing(), z13.VnB(), -15); __ Mrs(x2, NZCV); __ Cmpeq(p3.VnB(), p0.Zeroing(), z13.VnB(), -127); int z14_inputs[] = {0, 1, -1, -32767, -32766, 32767, 32766, 0}; int mask_inputs2[] = {1, 1, 1, 0, 1, 1, 1, 1}; InsrHelper(&masm, z14.VnH(), z14_inputs); Initialise(&masm, p0.VnH(), mask_inputs2); __ Cmpge(p4.VnH(), p0.Zeroing(), z14.VnH(), -1); __ Mrs(x4, NZCV); __ Cmpge(p5.VnH(), p0.Zeroing(), z14.VnH(), -32767); int z15_inputs[] = {0, 1, -1, INT_MIN}; int mask_inputs3[] = {0, 1, 1, 1}; InsrHelper(&masm, z15.VnS(), z15_inputs); Initialise(&masm, p0.VnS(), mask_inputs3); __ Cmpgt(p6.VnS(), p0.Zeroing(), z15.VnS(), 0); __ Mrs(x6, NZCV); __ Cmpgt(p7.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1); __ Cmplt(p8.VnS(), p0.Zeroing(), z15.VnS(), 0); __ Mrs(x8, NZCV); __ Cmplt(p9.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1); int64_t z16_inputs[] = {0, -1}; int mask_inputs4[] = {1, 1}; InsrHelper(&masm, z16.VnD(), z16_inputs); Initialise(&masm, p0.VnD(), mask_inputs4); __ Cmple(p10.VnD(), p0.Zeroing(), z16.VnD(), -1); __ Mrs(x10, NZCV); __ Cmple(p11.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MIN); __ Cmpne(p12.VnD(), p0.Zeroing(), z16.VnD(), -1); __ Mrs(x12, NZCV); __ Cmpne(p13.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MAX); END(); if (CAN_RUN()) { RUN(); int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1}; ASSERT_EQUAL_SVE(p2_expected, p2.VnB()); int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 0}; ASSERT_EQUAL_SVE(p3_expected, p3.VnB()); int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1, 0x1}; ASSERT_EQUAL_SVE(p4_expected, p4.VnH()); int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1}; ASSERT_EQUAL_SVE(p5_expected, p5.VnH()); int p6_expected[] = 
{0x0, 0x1, 0x0, 0x0}; ASSERT_EQUAL_SVE(p6_expected, p6.VnS()); int p7_expected[] = {0x0, 0x1, 0x1, 0x0}; ASSERT_EQUAL_SVE(p7_expected, p7.VnS()); int p8_expected[] = {0x0, 0x0, 0x1, 0x1}; ASSERT_EQUAL_SVE(p8_expected, p8.VnS()); int p9_expected[] = {0x0, 0x0, 0x0, 0x1}; ASSERT_EQUAL_SVE(p9_expected, p9.VnS()); int p10_expected[] = {0x00, 0x01}; ASSERT_EQUAL_SVE(p10_expected, p10.VnD()); int p11_expected[] = {0x00, 0x00}; ASSERT_EQUAL_SVE(p11_expected, p11.VnD()); int p12_expected[] = {0x01, 0x00}; ASSERT_EQUAL_SVE(p12_expected, p12.VnD()); int p13_expected[] = {0x01, 0x01}; ASSERT_EQUAL_SVE(p13_expected, p13.VnD()); ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w2); ASSERT_EQUAL_32(SVEFirstFlag, w4); ASSERT_EQUAL_32(NoFlag, w6); ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8); ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w10); ASSERT_EQUAL_32(NoFlag, w12); } } TEST(sve_int_compare_vectors_unsigned_imm) { SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint32_t src1_inputs[] = {0xf7, 0x0f, 0x8f, 0x1f, 0x83, 0x12, 0x00, 0xf1}; int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 0, 1}; InsrHelper(&masm, z13.VnB(), src1_inputs); Initialise(&masm, p0.VnB(), mask_inputs1); __ Cmphi(p2.VnB(), p0.Zeroing(), z13.VnB(), 0x0f); __ Mrs(x2, NZCV); __ Cmphi(p3.VnB(), p0.Zeroing(), z13.VnB(), 0xf0); uint32_t src2_inputs[] = {0xffff, 0x8000, 0x1fff, 0x0000, 0x1234}; int mask_inputs2[] = {1, 1, 1, 1, 0}; InsrHelper(&masm, z13.VnH(), src2_inputs); Initialise(&masm, p0.VnH(), mask_inputs2); __ Cmphs(p4.VnH(), p0.Zeroing(), z13.VnH(), 0x1f); __ Mrs(x4, NZCV); __ Cmphs(p5.VnH(), p0.Zeroing(), z13.VnH(), 0x1fff); uint32_t src3_inputs[] = {0xffffffff, 0xfedcba98, 0x0000ffff, 0x00000000}; int mask_inputs3[] = {1, 1, 1, 1}; InsrHelper(&masm, z13.VnS(), src3_inputs); Initialise(&masm, p0.VnS(), mask_inputs3); __ Cmplo(p6.VnS(), p0.Zeroing(), z13.VnS(), 0x3f); __ Mrs(x6, NZCV); __ Cmplo(p7.VnS(), p0.Zeroing(), z13.VnS(), 0x3f3f3f3f); uint64_t src4_inputs[] = {0xffffffffffffffff, 0x0000000000000000}; int mask_inputs4[] = {1, 1}; InsrHelper(&masm, z13.VnD(), src4_inputs); Initialise(&masm, p0.VnD(), mask_inputs4); __ Cmpls(p8.VnD(), p0.Zeroing(), z13.VnD(), 0x2f); __ Mrs(x8, NZCV); __ Cmpls(p9.VnD(), p0.Zeroing(), z13.VnD(), 0x800000000000000); END(); if (CAN_RUN()) { RUN(); int p2_expected[] = {1, 0, 1, 0, 1, 1, 0, 1}; ASSERT_EQUAL_SVE(p2_expected, p2.VnB()); int p3_expected[] = {1, 0, 0, 0, 0, 0, 0, 1}; ASSERT_EQUAL_SVE(p3_expected, p3.VnB()); int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0}; ASSERT_EQUAL_SVE(p4_expected, p4.VnH()); int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0}; ASSERT_EQUAL_SVE(p5_expected, p5.VnH()); int p6_expected[] = {0x0, 0x0, 0x0, 0x1}; ASSERT_EQUAL_SVE(p6_expected, p6.VnS()); int p7_expected[] = {0x0, 0x0, 0x1, 0x1}; ASSERT_EQUAL_SVE(p7_expected, p7.VnS()); int p8_expected[] = {0x00, 0x01}; ASSERT_EQUAL_SVE(p8_expected, p8.VnD()); int p9_expected[] = {0x00, 0x01}; ASSERT_EQUAL_SVE(p9_expected, p9.VnD()); ASSERT_EQUAL_32(SVEFirstFlag, w2); ASSERT_EQUAL_32(NoFlag, w4); ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w6); ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8); } } TEST(sve_int_compare_conditionally_terminate_scalars) { SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Mov(x0, 0xfedcba9887654321); __ Mov(x1, 0x1000100010001000); // Initialise Z and C. These are preserved by cterm*, and the V flag is set to // !C if the condition does not hold. 
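  // If the condition holds, cterm* instead sets N (reported as SVEFirstFlag
  // below) and clears V.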
__ Mov(x10, NoFlag); __ Msr(NZCV, x10); __ Ctermeq(w0, w0); __ Mrs(x2, NZCV); __ Ctermeq(x0, x1); __ Mrs(x3, NZCV); __ Ctermne(x0, x0); __ Mrs(x4, NZCV); __ Ctermne(w0, w1); __ Mrs(x5, NZCV); // As above, but with all flags initially set. __ Mov(x10, NZCVFlag); __ Msr(NZCV, x10); __ Ctermeq(w0, w0); __ Mrs(x6, NZCV); __ Ctermeq(x0, x1); __ Mrs(x7, NZCV); __ Ctermne(x0, x0); __ Mrs(x8, NZCV); __ Ctermne(w0, w1); __ Mrs(x9, NZCV); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_32(SVEFirstFlag, w2); ASSERT_EQUAL_32(VFlag, w3); ASSERT_EQUAL_32(VFlag, w4); ASSERT_EQUAL_32(SVEFirstFlag, w5); ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w6); ASSERT_EQUAL_32(ZCFlag, w7); ASSERT_EQUAL_32(ZCFlag, w8); ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w9); } } // Work out what the architectural `PredTest` pseudocode should produce for the // given result and governing predicate. template static StatusFlags GetPredTestFlags(const Td (&pd)[N], const Tg (&pg)[N], int vl) { int first = -1; int last = -1; bool any_active = false; // Only consider potentially-active lanes. int start = (N > vl) ? (N - vl) : 0; for (int i = start; i < N; i++) { if ((pg[i] & 1) == 1) { // Look for the first and last active lanes. // Note that the 'first' lane is the one with the highest index. if (last < 0) last = i; first = i; // Look for any active lanes that are also active in pd. if ((pd[i] & 1) == 1) any_active = true; } } uint32_t flags = 0; if ((first >= 0) && ((pd[first] & 1) == 1)) flags |= SVEFirstFlag; if (!any_active) flags |= SVENoneFlag; if ((last < 0) || ((pd[last] & 1) == 0)) flags |= SVENotLastFlag; return static_cast(flags); } typedef void (MacroAssembler::*PfirstPnextFn)(const PRegisterWithLaneSize& pd, const PRegister& pg, const PRegisterWithLaneSize& pn); template static void PfirstPnextHelper(Test* config, PfirstPnextFn macro, unsigned lane_size_in_bits, const Tg& pg_inputs, const Tn& pn_inputs, const Td& pd_expected) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); PRegister pg = p15; PRegister pn = p14; Initialise(&masm, pg.WithLaneSize(lane_size_in_bits), pg_inputs); Initialise(&masm, pn.WithLaneSize(lane_size_in_bits), pn_inputs); // Initialise NZCV to an impossible value, to check that we actually write it. __ Mov(x10, NZCVFlag); // If pd.Is(pn), the MacroAssembler simply passes the arguments directly to // the Assembler. __ Msr(NZCV, x10); __ Mov(p0, pn); (masm.*macro)(p0.WithLaneSize(lane_size_in_bits), pg, p0.WithLaneSize(lane_size_in_bits)); __ Mrs(x0, NZCV); // The MacroAssembler supports non-destructive use. __ Msr(NZCV, x10); (masm.*macro)(p1.WithLaneSize(lane_size_in_bits), pg, pn.WithLaneSize(lane_size_in_bits)); __ Mrs(x1, NZCV); // If pd.Aliases(pg) the macro requires a scratch register. { UseScratchRegisterScope temps(&masm); temps.Include(p13); __ Msr(NZCV, x10); __ Mov(p2, p15); (masm.*macro)(p2.WithLaneSize(lane_size_in_bits), p2, pn.WithLaneSize(lane_size_in_bits)); __ Mrs(x2, NZCV); } END(); if (CAN_RUN()) { RUN(); // Check that the inputs weren't modified. ASSERT_EQUAL_SVE(pn_inputs, pn.WithLaneSize(lane_size_in_bits)); ASSERT_EQUAL_SVE(pg_inputs, pg.WithLaneSize(lane_size_in_bits)); // Check the primary operation. ASSERT_EQUAL_SVE(pd_expected, p0.WithLaneSize(lane_size_in_bits)); ASSERT_EQUAL_SVE(pd_expected, p1.WithLaneSize(lane_size_in_bits)); ASSERT_EQUAL_SVE(pd_expected, p2.WithLaneSize(lane_size_in_bits)); // Check that the flags were properly set. 
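    // The expected flags are computed with the GetPredTestFlags model above,
    // so this compares the macro against the architectural PredTest behaviour
    // rather than against hard-coded values.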
StatusFlags nzcv_expected = GetPredTestFlags(pd_expected, pg_inputs, core.GetSVELaneCount(kBRegSize)); ASSERT_EQUAL_64(nzcv_expected, x0); ASSERT_EQUAL_64(nzcv_expected, x1); ASSERT_EQUAL_64(nzcv_expected, x2); } } template static void PfirstHelper(Test* config, const Tg& pg_inputs, const Tn& pn_inputs, const Td& pd_expected) { PfirstPnextHelper(config, &MacroAssembler::Pfirst, kBRegSize, // pfirst only accepts B-sized lanes. pg_inputs, pn_inputs, pd_expected); } template static void PnextHelper(Test* config, unsigned lane_size_in_bits, const Tg& pg_inputs, const Tn& pn_inputs, const Td& pd_expected) { PfirstPnextHelper(config, &MacroAssembler::Pnext, lane_size_in_bits, pg_inputs, pn_inputs, pd_expected); } TEST_SVE(sve_pfirst) { // Provide more lanes than kPRegMinSize (to check propagation if we have a // large VL), but few enough to make the test easy to read. int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int in1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0}; int in2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}; int in3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1}; int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; VIXL_ASSERT(ArrayLength(in0) > kPRegMinSize); // Pfirst finds the first active lane in pg, and activates the corresponding // lane in pn (if it isn't already active). // The first active lane in in1 is here. | // v int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}; int exp12[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0}; int exp13[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1}; int exp14[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}; PfirstHelper(config, in1, in0, exp10); PfirstHelper(config, in1, in2, exp12); PfirstHelper(config, in1, in3, exp13); PfirstHelper(config, in1, in4, exp14); // The first active lane in in2 is here. | // v int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}; int exp21[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0}; int exp23[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1}; int exp24[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}; PfirstHelper(config, in2, in0, exp20); PfirstHelper(config, in2, in1, exp21); PfirstHelper(config, in2, in3, exp23); PfirstHelper(config, in2, in4, exp24); // The first active lane in in3 is here. | // v int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int exp31[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1}; int exp32[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1}; int exp34[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; PfirstHelper(config, in3, in0, exp30); PfirstHelper(config, in3, in1, exp31); PfirstHelper(config, in3, in2, exp32); PfirstHelper(config, in3, in4, exp34); // | The first active lane in in4 is here. 
// v int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp41[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0}; int exp42[] = {1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}; int exp43[] = {1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1}; PfirstHelper(config, in4, in0, exp40); PfirstHelper(config, in4, in1, exp41); PfirstHelper(config, in4, in2, exp42); PfirstHelper(config, in4, in3, exp43); // If pg is all inactive, the input is passed through unchanged. PfirstHelper(config, in0, in0, in0); PfirstHelper(config, in0, in1, in1); PfirstHelper(config, in0, in2, in2); PfirstHelper(config, in0, in3, in3); // If the values of pg and pn match, the value is passed through unchanged. PfirstHelper(config, in0, in0, in0); PfirstHelper(config, in1, in1, in1); PfirstHelper(config, in2, in2, in2); PfirstHelper(config, in3, in3, in3); } TEST_SVE(sve_pfirst_alias) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Check that the Simulator behaves correctly when all arguments are aliased. int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0}; int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0}; int in_s[] = {0, 1, 1, 0}; int in_d[] = {1, 1}; Initialise(&masm, p0.VnB(), in_b); Initialise(&masm, p1.VnH(), in_h); Initialise(&masm, p2.VnS(), in_s); Initialise(&masm, p3.VnD(), in_d); // Initialise NZCV to an impossible value, to check that we actually write it. __ Mov(x10, NZCVFlag); __ Msr(NZCV, x10); __ Pfirst(p0.VnB(), p0, p0.VnB()); __ Mrs(x0, NZCV); __ Msr(NZCV, x10); __ Pfirst(p1.VnB(), p1, p1.VnB()); __ Mrs(x1, NZCV); __ Msr(NZCV, x10); __ Pfirst(p2.VnB(), p2, p2.VnB()); __ Mrs(x2, NZCV); __ Msr(NZCV, x10); __ Pfirst(p3.VnB(), p3, p3.VnB()); __ Mrs(x3, NZCV); END(); if (CAN_RUN()) { RUN(); // The first lane from pg is already active in pdn, so the P register should // be unchanged. ASSERT_EQUAL_SVE(in_b, p0.VnB()); ASSERT_EQUAL_SVE(in_h, p1.VnH()); ASSERT_EQUAL_SVE(in_s, p2.VnS()); ASSERT_EQUAL_SVE(in_d, p3.VnD()); ASSERT_EQUAL_64(SVEFirstFlag, x0); ASSERT_EQUAL_64(SVEFirstFlag, x1); ASSERT_EQUAL_64(SVEFirstFlag, x2); ASSERT_EQUAL_64(SVEFirstFlag, x3); } } TEST_SVE(sve_pnext_b) { // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize // (to check propagation if we have a large VL), but few enough to make the // test easy to read. // For now, we just use kPRegMinSize so that the test works anywhere. int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int in1[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0}; int in2[] = {0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}; int in3[] = {0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1}; int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // Pnext activates the next element that is true in pg, after the last-active // element in pn. If all pn elements are false (as in in0), it starts looking // at element 0. // There are no active lanes in in0, so the result is simply the first active // lane from pg. int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}; int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}; int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // The last active lane in in1 is here. 
| // v int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp21[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp31[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // | The last active lane in in2 is here. // v int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // | The last active lane in in3 is here. // v int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp23[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // | The last active lane in in4 is here. // v int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; PnextHelper(config, kBRegSize, in0, in0, exp00); PnextHelper(config, kBRegSize, in1, in0, exp10); PnextHelper(config, kBRegSize, in2, in0, exp20); PnextHelper(config, kBRegSize, in3, in0, exp30); PnextHelper(config, kBRegSize, in4, in0, exp40); PnextHelper(config, kBRegSize, in0, in1, exp01); PnextHelper(config, kBRegSize, in1, in1, exp11); PnextHelper(config, kBRegSize, in2, in1, exp21); PnextHelper(config, kBRegSize, in3, in1, exp31); PnextHelper(config, kBRegSize, in4, in1, exp41); PnextHelper(config, kBRegSize, in0, in2, exp02); PnextHelper(config, kBRegSize, in1, in2, exp12); PnextHelper(config, kBRegSize, in2, in2, exp22); PnextHelper(config, kBRegSize, in3, in2, exp32); PnextHelper(config, kBRegSize, in4, in2, exp42); PnextHelper(config, kBRegSize, in0, in3, exp03); PnextHelper(config, kBRegSize, in1, in3, exp13); PnextHelper(config, kBRegSize, in2, in3, exp23); PnextHelper(config, kBRegSize, in3, in3, exp33); PnextHelper(config, kBRegSize, in4, in3, exp43); PnextHelper(config, kBRegSize, in0, in4, exp04); PnextHelper(config, kBRegSize, in1, in4, exp14); PnextHelper(config, kBRegSize, in2, in4, exp24); PnextHelper(config, kBRegSize, in3, in4, exp34); PnextHelper(config, kBRegSize, in4, in4, exp44); } TEST_SVE(sve_pnext_h) { // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize // (to check propagation if we have a large VL), but few enough to make the // test easy to read. // For now, we just use kPRegMinSize so that the test works anywhere. int in0[] = {0, 0, 0, 0, 0, 0, 0, 0}; int in1[] = {0, 0, 0, 1, 0, 2, 1, 0}; int in2[] = {0, 1, 2, 0, 2, 0, 2, 0}; int in3[] = {0, 0, 0, 3, 0, 0, 0, 3}; int in4[] = {3, 0, 0, 0, 0, 0, 0, 0}; // Pnext activates the next element that is true in pg, after the last-active // element in pn. If all pn elements are false (as in in0), it starts looking // at element 0. // // As for other SVE instructions, elements are only considered to be active if // the _first_ bit in each field is one. Other bits are ignored. // There are no active lanes in in0, so the result is simply the first active // lane from pg. 
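  // For example, the value 2 in in1 and in2 has a clear low bit, so those
  // fields are treated as inactive lanes even though they are non-zero.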
int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp10[] = {0, 0, 0, 0, 0, 0, 1, 0}; int exp20[] = {0, 1, 0, 0, 0, 0, 0, 0}; int exp30[] = {0, 0, 0, 0, 0, 0, 0, 1}; int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0}; // | The last active lane in in1 is here. // v int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp21[] = {0, 1, 0, 0, 0, 0, 0, 0}; int exp31[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0}; // | The last active lane in in2 is here. // v int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0}; // | The last active lane in in3 is here. // v int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp23[] = {0, 1, 0, 0, 0, 0, 0, 0}; int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0}; // | The last active lane in in4 is here. // v int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0}; int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0}; PnextHelper(config, kHRegSize, in0, in0, exp00); PnextHelper(config, kHRegSize, in1, in0, exp10); PnextHelper(config, kHRegSize, in2, in0, exp20); PnextHelper(config, kHRegSize, in3, in0, exp30); PnextHelper(config, kHRegSize, in4, in0, exp40); PnextHelper(config, kHRegSize, in0, in1, exp01); PnextHelper(config, kHRegSize, in1, in1, exp11); PnextHelper(config, kHRegSize, in2, in1, exp21); PnextHelper(config, kHRegSize, in3, in1, exp31); PnextHelper(config, kHRegSize, in4, in1, exp41); PnextHelper(config, kHRegSize, in0, in2, exp02); PnextHelper(config, kHRegSize, in1, in2, exp12); PnextHelper(config, kHRegSize, in2, in2, exp22); PnextHelper(config, kHRegSize, in3, in2, exp32); PnextHelper(config, kHRegSize, in4, in2, exp42); PnextHelper(config, kHRegSize, in0, in3, exp03); PnextHelper(config, kHRegSize, in1, in3, exp13); PnextHelper(config, kHRegSize, in2, in3, exp23); PnextHelper(config, kHRegSize, in3, in3, exp33); PnextHelper(config, kHRegSize, in4, in3, exp43); PnextHelper(config, kHRegSize, in0, in4, exp04); PnextHelper(config, kHRegSize, in1, in4, exp14); PnextHelper(config, kHRegSize, in2, in4, exp24); PnextHelper(config, kHRegSize, in3, in4, exp34); PnextHelper(config, kHRegSize, in4, in4, exp44); } TEST_SVE(sve_pnext_s) { // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize // (to check propagation if we have a large VL), but few enough to make the // test easy to read. // For now, we just use kPRegMinSize so that the test works anywhere. int in0[] = {0xe, 0xc, 0x8, 0x0}; int in1[] = {0x0, 0x2, 0x0, 0x1}; int in2[] = {0x0, 0x1, 0xf, 0x0}; int in3[] = {0xf, 0x0, 0x0, 0x0}; // Pnext activates the next element that is true in pg, after the last-active // element in pn. If all pn elements are false (as in in0), it starts looking // at element 0. // // As for other SVE instructions, elements are only considered to be active if // the _first_ bit in each field is one. Other bits are ignored. // There are no active lanes in in0, so the result is simply the first active // lane from pg. int exp00[] = {0, 0, 0, 0}; int exp10[] = {0, 0, 0, 1}; int exp20[] = {0, 0, 1, 0}; int exp30[] = {1, 0, 0, 0}; // | The last active lane in in1 is here. // v int exp01[] = {0, 0, 0, 0}; int exp11[] = {0, 0, 0, 0}; int exp21[] = {0, 0, 1, 0}; int exp31[] = {1, 0, 0, 0}; // | The last active lane in in2 is here. 
// v int exp02[] = {0, 0, 0, 0}; int exp12[] = {0, 0, 0, 0}; int exp22[] = {0, 0, 0, 0}; int exp32[] = {1, 0, 0, 0}; // | The last active lane in in3 is here. // v int exp03[] = {0, 0, 0, 0}; int exp13[] = {0, 0, 0, 0}; int exp23[] = {0, 0, 0, 0}; int exp33[] = {0, 0, 0, 0}; PnextHelper(config, kSRegSize, in0, in0, exp00); PnextHelper(config, kSRegSize, in1, in0, exp10); PnextHelper(config, kSRegSize, in2, in0, exp20); PnextHelper(config, kSRegSize, in3, in0, exp30); PnextHelper(config, kSRegSize, in0, in1, exp01); PnextHelper(config, kSRegSize, in1, in1, exp11); PnextHelper(config, kSRegSize, in2, in1, exp21); PnextHelper(config, kSRegSize, in3, in1, exp31); PnextHelper(config, kSRegSize, in0, in2, exp02); PnextHelper(config, kSRegSize, in1, in2, exp12); PnextHelper(config, kSRegSize, in2, in2, exp22); PnextHelper(config, kSRegSize, in3, in2, exp32); PnextHelper(config, kSRegSize, in0, in3, exp03); PnextHelper(config, kSRegSize, in1, in3, exp13); PnextHelper(config, kSRegSize, in2, in3, exp23); PnextHelper(config, kSRegSize, in3, in3, exp33); } TEST_SVE(sve_pnext_d) { // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize // (to check propagation if we have a large VL), but few enough to make the // test easy to read. // For now, we just use kPRegMinSize so that the test works anywhere. int in0[] = {0xfe, 0xf0}; int in1[] = {0x00, 0x55}; int in2[] = {0x33, 0xff}; // Pnext activates the next element that is true in pg, after the last-active // element in pn. If all pn elements are false (as in in0), it starts looking // at element 0. // // As for other SVE instructions, elements are only considered to be active if // the _first_ bit in each field is one. Other bits are ignored. // There are no active lanes in in0, so the result is simply the first active // lane from pg. int exp00[] = {0, 0}; int exp10[] = {0, 1}; int exp20[] = {0, 1}; // | The last active lane in in1 is here. // v int exp01[] = {0, 0}; int exp11[] = {0, 0}; int exp21[] = {1, 0}; // | The last active lane in in2 is here. // v int exp02[] = {0, 0}; int exp12[] = {0, 0}; int exp22[] = {0, 0}; PnextHelper(config, kDRegSize, in0, in0, exp00); PnextHelper(config, kDRegSize, in1, in0, exp10); PnextHelper(config, kDRegSize, in2, in0, exp20); PnextHelper(config, kDRegSize, in0, in1, exp01); PnextHelper(config, kDRegSize, in1, in1, exp11); PnextHelper(config, kDRegSize, in2, in1, exp21); PnextHelper(config, kDRegSize, in0, in2, exp02); PnextHelper(config, kDRegSize, in1, in2, exp12); PnextHelper(config, kDRegSize, in2, in2, exp22); } TEST_SVE(sve_pnext_alias) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Check that the Simulator behaves correctly when all arguments are aliased. int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0}; int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0}; int in_s[] = {0, 1, 1, 0}; int in_d[] = {1, 1}; Initialise(&masm, p0.VnB(), in_b); Initialise(&masm, p1.VnH(), in_h); Initialise(&masm, p2.VnS(), in_s); Initialise(&masm, p3.VnD(), in_d); // Initialise NZCV to an impossible value, to check that we actually write it. 
__ Mov(x10, NZCVFlag); __ Msr(NZCV, x10); __ Pnext(p0.VnB(), p0, p0.VnB()); __ Mrs(x0, NZCV); __ Msr(NZCV, x10); __ Pnext(p1.VnB(), p1, p1.VnB()); __ Mrs(x1, NZCV); __ Msr(NZCV, x10); __ Pnext(p2.VnB(), p2, p2.VnB()); __ Mrs(x2, NZCV); __ Msr(NZCV, x10); __ Pnext(p3.VnB(), p3, p3.VnB()); __ Mrs(x3, NZCV); END(); if (CAN_RUN()) { RUN(); // Since pg.Is(pdn), there can be no active lanes in pg above the last // active lane in pdn, so the result should always be zero. ASSERT_EQUAL_SVE(0, p0.VnB()); ASSERT_EQUAL_SVE(0, p1.VnH()); ASSERT_EQUAL_SVE(0, p2.VnS()); ASSERT_EQUAL_SVE(0, p3.VnD()); ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x0); ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x1); ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x2); ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x3); } } static void PtrueHelper(Test* config, unsigned lane_size_in_bits, FlagsUpdate s = LeaveFlags) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); PRegisterWithLaneSize p[kNumberOfPRegisters]; for (unsigned i = 0; i < kNumberOfPRegisters; i++) { p[i] = PRegister(i).WithLaneSize(lane_size_in_bits); } // Initialise NZCV to an impossible value, to check that we actually write it. StatusFlags nzcv_unmodified = NZCVFlag; __ Mov(x20, nzcv_unmodified); // We don't have enough registers to conveniently test every pattern, so take // samples from each group. __ Msr(NZCV, x20); __ Ptrue(p[0], SVE_POW2, s); __ Mrs(x0, NZCV); __ Msr(NZCV, x20); __ Ptrue(p[1], SVE_VL1, s); __ Mrs(x1, NZCV); __ Msr(NZCV, x20); __ Ptrue(p[2], SVE_VL2, s); __ Mrs(x2, NZCV); __ Msr(NZCV, x20); __ Ptrue(p[3], SVE_VL5, s); __ Mrs(x3, NZCV); __ Msr(NZCV, x20); __ Ptrue(p[4], SVE_VL6, s); __ Mrs(x4, NZCV); __ Msr(NZCV, x20); __ Ptrue(p[5], SVE_VL8, s); __ Mrs(x5, NZCV); __ Msr(NZCV, x20); __ Ptrue(p[6], SVE_VL16, s); __ Mrs(x6, NZCV); __ Msr(NZCV, x20); __ Ptrue(p[7], SVE_VL64, s); __ Mrs(x7, NZCV); __ Msr(NZCV, x20); __ Ptrue(p[8], SVE_VL256, s); __ Mrs(x8, NZCV); { // We have to use the Assembler to use values not defined by // SVEPredicateConstraint, so call `ptrues` directly.. typedef void ( MacroAssembler::*AssemblePtrueFn)(const PRegisterWithLaneSize& pd, int pattern); AssemblePtrueFn assemble = &MacroAssembler::ptrue; if (s == SetFlags) { assemble = &MacroAssembler::ptrues; } ExactAssemblyScope guard(&masm, 12 * kInstructionSize); __ msr(NZCV, x20); (masm.*assemble)(p[9], 0xe); __ mrs(x9, NZCV); __ msr(NZCV, x20); (masm.*assemble)(p[10], 0x16); __ mrs(x10, NZCV); __ msr(NZCV, x20); (masm.*assemble)(p[11], 0x1a); __ mrs(x11, NZCV); __ msr(NZCV, x20); (masm.*assemble)(p[12], 0x1c); __ mrs(x12, NZCV); } __ Msr(NZCV, x20); __ Ptrue(p[13], SVE_MUL4, s); __ Mrs(x13, NZCV); __ Msr(NZCV, x20); __ Ptrue(p[14], SVE_MUL3, s); __ Mrs(x14, NZCV); __ Msr(NZCV, x20); __ Ptrue(p[15], SVE_ALL, s); __ Mrs(x15, NZCV); END(); if (CAN_RUN()) { RUN(); int all = core.GetSVELaneCount(lane_size_in_bits); int pow2 = 1 << HighestSetBitPosition(all); int mul4 = all - (all % 4); int mul3 = all - (all % 3); // Check P register results. 
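    // Patterns 0xe, 0x16, 0x1a and 0x1c (used for p[9]-p[12]) are unallocated,
    // so they are expected to behave like an all-false predicate.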
for (int i = 0; i < all; i++) { ASSERT_EQUAL_SVE_LANE(i < pow2, p[0], i); ASSERT_EQUAL_SVE_LANE((all >= 1) && (i < 1), p[1], i); ASSERT_EQUAL_SVE_LANE((all >= 2) && (i < 2), p[2], i); ASSERT_EQUAL_SVE_LANE((all >= 5) && (i < 5), p[3], i); ASSERT_EQUAL_SVE_LANE((all >= 6) && (i < 6), p[4], i); ASSERT_EQUAL_SVE_LANE((all >= 8) && (i < 8), p[5], i); ASSERT_EQUAL_SVE_LANE((all >= 16) && (i < 16), p[6], i); ASSERT_EQUAL_SVE_LANE((all >= 64) && (i < 64), p[7], i); ASSERT_EQUAL_SVE_LANE((all >= 256) && (i < 256), p[8], i); ASSERT_EQUAL_SVE_LANE(false, p[9], i); ASSERT_EQUAL_SVE_LANE(false, p[10], i); ASSERT_EQUAL_SVE_LANE(false, p[11], i); ASSERT_EQUAL_SVE_LANE(false, p[12], i); ASSERT_EQUAL_SVE_LANE(i < mul4, p[13], i); ASSERT_EQUAL_SVE_LANE(i < mul3, p[14], i); ASSERT_EQUAL_SVE_LANE(true, p[15], i); } // Check NZCV results. if (s == LeaveFlags) { // No flags should have been updated. for (int i = 0; i <= 15; i++) { ASSERT_EQUAL_64(nzcv_unmodified, XRegister(i)); } } else { StatusFlags zero = static_cast(SVENoneFlag | SVENotLastFlag); StatusFlags nonzero = SVEFirstFlag; // POW2 ASSERT_EQUAL_64(nonzero, x0); // VL* ASSERT_EQUAL_64((all >= 1) ? nonzero : zero, x1); ASSERT_EQUAL_64((all >= 2) ? nonzero : zero, x2); ASSERT_EQUAL_64((all >= 5) ? nonzero : zero, x3); ASSERT_EQUAL_64((all >= 6) ? nonzero : zero, x4); ASSERT_EQUAL_64((all >= 8) ? nonzero : zero, x5); ASSERT_EQUAL_64((all >= 16) ? nonzero : zero, x6); ASSERT_EQUAL_64((all >= 64) ? nonzero : zero, x7); ASSERT_EQUAL_64((all >= 256) ? nonzero : zero, x8); // #uimm5 ASSERT_EQUAL_64(zero, x9); ASSERT_EQUAL_64(zero, x10); ASSERT_EQUAL_64(zero, x11); ASSERT_EQUAL_64(zero, x12); // MUL* ASSERT_EQUAL_64((all >= 4) ? nonzero : zero, x13); ASSERT_EQUAL_64((all >= 3) ? nonzero : zero, x14); // ALL ASSERT_EQUAL_64(nonzero, x15); } } } TEST_SVE(sve_ptrue_b) { PtrueHelper(config, kBRegSize, LeaveFlags); } TEST_SVE(sve_ptrue_h) { PtrueHelper(config, kHRegSize, LeaveFlags); } TEST_SVE(sve_ptrue_s) { PtrueHelper(config, kSRegSize, LeaveFlags); } TEST_SVE(sve_ptrue_d) { PtrueHelper(config, kDRegSize, LeaveFlags); } TEST_SVE(sve_ptrues_b) { PtrueHelper(config, kBRegSize, SetFlags); } TEST_SVE(sve_ptrues_h) { PtrueHelper(config, kHRegSize, SetFlags); } TEST_SVE(sve_ptrues_s) { PtrueHelper(config, kSRegSize, SetFlags); } TEST_SVE(sve_ptrues_d) { PtrueHelper(config, kDRegSize, SetFlags); } TEST_SVE(sve_pfalse) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Initialise non-zero inputs. __ Ptrue(p0.VnB()); __ Ptrue(p1.VnH()); __ Ptrue(p2.VnS()); __ Ptrue(p3.VnD()); // The instruction only supports B-sized lanes, but the lane size has no // logical effect, so the MacroAssembler accepts anything. __ Pfalse(p0.VnB()); __ Pfalse(p1.VnH()); __ Pfalse(p2.VnS()); __ Pfalse(p3.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(0, p0.VnB()); ASSERT_EQUAL_SVE(0, p1.VnB()); ASSERT_EQUAL_SVE(0, p2.VnB()); ASSERT_EQUAL_SVE(0, p3.VnB()); } } TEST_SVE(sve_ptest) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Initialise NZCV to a known (impossible) value. StatusFlags nzcv_unmodified = NZCVFlag; __ Mov(x0, nzcv_unmodified); __ Msr(NZCV, x0); // Construct some test inputs. int in2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0}; int in3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0}; int in4[] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0}; __ Pfalse(p0.VnB()); __ Ptrue(p1.VnB()); Initialise(&masm, p2.VnB(), in2); Initialise(&masm, p3.VnB(), in3); Initialise(&masm, p4.VnB(), in4); // All-inactive pg. 
__ Ptest(p0, p0.VnB()); __ Mrs(x0, NZCV); __ Ptest(p0, p1.VnB()); __ Mrs(x1, NZCV); __ Ptest(p0, p2.VnB()); __ Mrs(x2, NZCV); __ Ptest(p0, p3.VnB()); __ Mrs(x3, NZCV); __ Ptest(p0, p4.VnB()); __ Mrs(x4, NZCV); // All-active pg. __ Ptest(p1, p0.VnB()); __ Mrs(x5, NZCV); __ Ptest(p1, p1.VnB()); __ Mrs(x6, NZCV); __ Ptest(p1, p2.VnB()); __ Mrs(x7, NZCV); __ Ptest(p1, p3.VnB()); __ Mrs(x8, NZCV); __ Ptest(p1, p4.VnB()); __ Mrs(x9, NZCV); // Combinations of other inputs. __ Ptest(p2, p2.VnB()); __ Mrs(x20, NZCV); __ Ptest(p2, p3.VnB()); __ Mrs(x21, NZCV); __ Ptest(p2, p4.VnB()); __ Mrs(x22, NZCV); __ Ptest(p3, p2.VnB()); __ Mrs(x23, NZCV); __ Ptest(p3, p3.VnB()); __ Mrs(x24, NZCV); __ Ptest(p3, p4.VnB()); __ Mrs(x25, NZCV); __ Ptest(p4, p2.VnB()); __ Mrs(x26, NZCV); __ Ptest(p4, p3.VnB()); __ Mrs(x27, NZCV); __ Ptest(p4, p4.VnB()); __ Mrs(x28, NZCV); END(); if (CAN_RUN()) { RUN(); StatusFlags zero = static_cast(SVENoneFlag | SVENotLastFlag); // If pg is all inactive, the value of pn is irrelevant. ASSERT_EQUAL_64(zero, x0); ASSERT_EQUAL_64(zero, x1); ASSERT_EQUAL_64(zero, x2); ASSERT_EQUAL_64(zero, x3); ASSERT_EQUAL_64(zero, x4); // All-active pg. ASSERT_EQUAL_64(zero, x5); // All-inactive pn. ASSERT_EQUAL_64(SVEFirstFlag, x6); // All-active pn. // Other pn inputs are non-zero, but the first and last lanes are inactive. ASSERT_EQUAL_64(SVENotLastFlag, x7); ASSERT_EQUAL_64(SVENotLastFlag, x8); ASSERT_EQUAL_64(SVENotLastFlag, x9); // Other inputs. ASSERT_EQUAL_64(SVEFirstFlag, x20); // pg: in2, pn: in2 ASSERT_EQUAL_64(NoFlag, x21); // pg: in2, pn: in3 ASSERT_EQUAL_64(zero, x22); // pg: in2, pn: in4 ASSERT_EQUAL_64(static_cast(SVEFirstFlag | SVENotLastFlag), x23); // pg: in3, pn: in2 ASSERT_EQUAL_64(SVEFirstFlag, x24); // pg: in3, pn: in3 ASSERT_EQUAL_64(zero, x25); // pg: in3, pn: in4 ASSERT_EQUAL_64(zero, x26); // pg: in4, pn: in2 ASSERT_EQUAL_64(zero, x27); // pg: in4, pn: in3 ASSERT_EQUAL_64(SVEFirstFlag, x28); // pg: in4, pn: in4 } } TEST_SVE(sve_cntp) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // There are {7, 5, 2, 1} active {B, H, S, D} lanes. int p0_inputs[] = {0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0}; Initialise(&masm, p0.VnB(), p0_inputs); // With an all-true predicate, these instructions measure the vector length. __ Ptrue(p10.VnB()); __ Ptrue(p11.VnH()); __ Ptrue(p12.VnS()); __ Ptrue(p13.VnD()); // `ptrue p10.b` provides an all-active pg. __ Cntp(x10, p10, p10.VnB()); __ Cntp(x11, p10, p11.VnH()); __ Cntp(x12, p10, p12.VnS()); __ Cntp(x13, p10, p13.VnD()); // Check that the predicate mask is applied properly. __ Cntp(x14, p10, p10.VnB()); __ Cntp(x15, p11, p10.VnB()); __ Cntp(x16, p12, p10.VnB()); __ Cntp(x17, p13, p10.VnB()); // Check other patterns (including some ignored bits). __ Cntp(x0, p10, p0.VnB()); __ Cntp(x1, p10, p0.VnH()); __ Cntp(x2, p10, p0.VnS()); __ Cntp(x3, p10, p0.VnD()); __ Cntp(x4, p0, p10.VnB()); __ Cntp(x5, p0, p10.VnH()); __ Cntp(x6, p0, p10.VnS()); __ Cntp(x7, p0, p10.VnD()); END(); if (CAN_RUN()) { RUN(); int vl_b = core.GetSVELaneCount(kBRegSize); int vl_h = core.GetSVELaneCount(kHRegSize); int vl_s = core.GetSVELaneCount(kSRegSize); int vl_d = core.GetSVELaneCount(kDRegSize); // Check all-active predicates in various combinations. ASSERT_EQUAL_64(vl_b, x10); ASSERT_EQUAL_64(vl_h, x11); ASSERT_EQUAL_64(vl_s, x12); ASSERT_EQUAL_64(vl_d, x13); ASSERT_EQUAL_64(vl_b, x14); ASSERT_EQUAL_64(vl_h, x15); ASSERT_EQUAL_64(vl_s, x16); ASSERT_EQUAL_64(vl_d, x17); // Check that irrelevant bits are properly ignored. 
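    // Only the lowest bit of each H, S or D lane's predicate field is
    // significant, so viewing p0 at those lane sizes gives the counts of 5, 2
    // and 1 checked below.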
    ASSERT_EQUAL_64(7, x0);
    ASSERT_EQUAL_64(5, x1);
    ASSERT_EQUAL_64(2, x2);
    ASSERT_EQUAL_64(1, x3);
    ASSERT_EQUAL_64(7, x4);
    ASSERT_EQUAL_64(5, x5);
    ASSERT_EQUAL_64(2, x6);
    ASSERT_EQUAL_64(1, x7);
  }
}

typedef void (MacroAssembler::*CntFn)(const Register& dst,
                                      int pattern,
                                      int multiplier);

template <typename T>
void GenerateCntSequence(MacroAssembler* masm,
                         CntFn cnt,
                         T acc_value,
                         int multiplier) {
  // Initialise accumulators.
  masm->Mov(x0, acc_value);
  masm->Mov(x1, acc_value);
  masm->Mov(x2, acc_value);
  masm->Mov(x3, acc_value);
  masm->Mov(x4, acc_value);
  masm->Mov(x5, acc_value);
  masm->Mov(x6, acc_value);
  masm->Mov(x7, acc_value);
  masm->Mov(x8, acc_value);
  masm->Mov(x9, acc_value);
  masm->Mov(x10, acc_value);
  masm->Mov(x11, acc_value);
  masm->Mov(x12, acc_value);
  masm->Mov(x13, acc_value);
  masm->Mov(x14, acc_value);
  masm->Mov(x15, acc_value);
  masm->Mov(x18, acc_value);
  masm->Mov(x19, acc_value);
  masm->Mov(x20, acc_value);
  masm->Mov(x21, acc_value);

  (masm->*cnt)(Register(0, sizeof(T) * kBitsPerByte), SVE_POW2, multiplier);
  (masm->*cnt)(Register(1, sizeof(T) * kBitsPerByte), SVE_VL1, multiplier);
  (masm->*cnt)(Register(2, sizeof(T) * kBitsPerByte), SVE_VL2, multiplier);
  (masm->*cnt)(Register(3, sizeof(T) * kBitsPerByte), SVE_VL3, multiplier);
  (masm->*cnt)(Register(4, sizeof(T) * kBitsPerByte), SVE_VL4, multiplier);
  (masm->*cnt)(Register(5, sizeof(T) * kBitsPerByte), SVE_VL5, multiplier);
  (masm->*cnt)(Register(6, sizeof(T) * kBitsPerByte), SVE_VL6, multiplier);
  (masm->*cnt)(Register(7, sizeof(T) * kBitsPerByte), SVE_VL7, multiplier);
  (masm->*cnt)(Register(8, sizeof(T) * kBitsPerByte), SVE_VL8, multiplier);
  (masm->*cnt)(Register(9, sizeof(T) * kBitsPerByte), SVE_VL16, multiplier);
  (masm->*cnt)(Register(10, sizeof(T) * kBitsPerByte), SVE_VL32, multiplier);
  (masm->*cnt)(Register(11, sizeof(T) * kBitsPerByte), SVE_VL64, multiplier);
  (masm->*cnt)(Register(12, sizeof(T) * kBitsPerByte), SVE_VL128, multiplier);
  (masm->*cnt)(Register(13, sizeof(T) * kBitsPerByte), SVE_VL256, multiplier);
  (masm->*cnt)(Register(14, sizeof(T) * kBitsPerByte), 16, multiplier);
  (masm->*cnt)(Register(15, sizeof(T) * kBitsPerByte), 23, multiplier);
  (masm->*cnt)(Register(18, sizeof(T) * kBitsPerByte), 28, multiplier);
  (masm->*cnt)(Register(19, sizeof(T) * kBitsPerByte), SVE_MUL4, multiplier);
  (masm->*cnt)(Register(20, sizeof(T) * kBitsPerByte), SVE_MUL3, multiplier);
  (masm->*cnt)(Register(21, sizeof(T) * kBitsPerByte), SVE_ALL, multiplier);
}

int FixedVL(int fixed, int length) {
  VIXL_ASSERT(((fixed >= 1) && (fixed <= 8)) || (fixed == 16) ||
              (fixed == 32) || (fixed == 64) || (fixed == 128) ||
              (fixed == 256));
  return (length >= fixed) ? fixed : 0;
}

static void CntHelper(Test* config,
                      CntFn cnt,
                      int multiplier,
                      int lane_size_in_bits,
                      int64_t acc_value = 0,
                      bool is_increment = true) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  GenerateCntSequence(&masm, cnt, acc_value, multiplier);

  END();
  if (CAN_RUN()) {
    RUN();

    int all = core.GetSVELaneCount(lane_size_in_bits);
    int pow2 = 1 << HighestSetBitPosition(all);
    int mul4 = all - (all % 4);
    int mul3 = all - (all % 3);

    multiplier = is_increment ?
multiplier : -multiplier; ASSERT_EQUAL_64(acc_value + (multiplier * pow2), x0); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(1, all)), x1); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(2, all)), x2); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(3, all)), x3); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(4, all)), x4); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(5, all)), x5); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(6, all)), x6); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(7, all)), x7); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(8, all)), x8); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(16, all)), x9); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(32, all)), x10); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(64, all)), x11); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(128, all)), x12); ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(256, all)), x13); ASSERT_EQUAL_64(acc_value, x14); ASSERT_EQUAL_64(acc_value, x15); ASSERT_EQUAL_64(acc_value, x18); ASSERT_EQUAL_64(acc_value + (multiplier * mul4), x19); ASSERT_EQUAL_64(acc_value + (multiplier * mul3), x20); ASSERT_EQUAL_64(acc_value + (multiplier * all), x21); } } static void IncHelper(Test* config, CntFn cnt, int multiplier, int lane_size_in_bits, int64_t acc_value) { CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true); } static void DecHelper(Test* config, CntFn cnt, int multiplier, int lane_size_in_bits, int64_t acc_value) { CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false); } TEST_SVE(sve_cntb) { CntHelper(config, &MacroAssembler::Cntb, 1, kBRegSize); CntHelper(config, &MacroAssembler::Cntb, 2, kBRegSize); CntHelper(config, &MacroAssembler::Cntb, 15, kBRegSize); CntHelper(config, &MacroAssembler::Cntb, 16, kBRegSize); } TEST_SVE(sve_cnth) { CntHelper(config, &MacroAssembler::Cnth, 1, kHRegSize); CntHelper(config, &MacroAssembler::Cnth, 2, kHRegSize); CntHelper(config, &MacroAssembler::Cnth, 15, kHRegSize); CntHelper(config, &MacroAssembler::Cnth, 16, kHRegSize); } TEST_SVE(sve_cntw) { CntHelper(config, &MacroAssembler::Cntw, 1, kWRegSize); CntHelper(config, &MacroAssembler::Cntw, 2, kWRegSize); CntHelper(config, &MacroAssembler::Cntw, 15, kWRegSize); CntHelper(config, &MacroAssembler::Cntw, 16, kWRegSize); } TEST_SVE(sve_cntd) { CntHelper(config, &MacroAssembler::Cntd, 1, kDRegSize); CntHelper(config, &MacroAssembler::Cntd, 2, kDRegSize); CntHelper(config, &MacroAssembler::Cntd, 15, kDRegSize); CntHelper(config, &MacroAssembler::Cntd, 16, kDRegSize); } TEST_SVE(sve_decb) { DecHelper(config, &MacroAssembler::Decb, 1, kBRegSize, 42); DecHelper(config, &MacroAssembler::Decb, 2, kBRegSize, -1); DecHelper(config, &MacroAssembler::Decb, 15, kBRegSize, INT64_MIN); DecHelper(config, &MacroAssembler::Decb, 16, kBRegSize, -42); } TEST_SVE(sve_dech) { DecHelper(config, &MacroAssembler::Dech, 1, kHRegSize, 42); DecHelper(config, &MacroAssembler::Dech, 2, kHRegSize, -1); DecHelper(config, &MacroAssembler::Dech, 15, kHRegSize, INT64_MIN); DecHelper(config, &MacroAssembler::Dech, 16, kHRegSize, -42); } TEST_SVE(sve_decw) { DecHelper(config, &MacroAssembler::Decw, 1, kWRegSize, 42); DecHelper(config, &MacroAssembler::Decw, 2, kWRegSize, -1); DecHelper(config, &MacroAssembler::Decw, 15, kWRegSize, INT64_MIN); DecHelper(config, &MacroAssembler::Decw, 16, kWRegSize, -42); } TEST_SVE(sve_decd) { DecHelper(config, &MacroAssembler::Decd, 1, kDRegSize, 42); DecHelper(config, &MacroAssembler::Decd, 2, kDRegSize, -1); DecHelper(config, 
&MacroAssembler::Decd, 15, kDRegSize, INT64_MIN); DecHelper(config, &MacroAssembler::Decd, 16, kDRegSize, -42); } TEST_SVE(sve_incb) { IncHelper(config, &MacroAssembler::Incb, 1, kBRegSize, 42); IncHelper(config, &MacroAssembler::Incb, 2, kBRegSize, -1); IncHelper(config, &MacroAssembler::Incb, 15, kBRegSize, INT64_MAX); IncHelper(config, &MacroAssembler::Incb, 16, kBRegSize, -42); } TEST_SVE(sve_inch) { IncHelper(config, &MacroAssembler::Inch, 1, kHRegSize, 42); IncHelper(config, &MacroAssembler::Inch, 2, kHRegSize, -1); IncHelper(config, &MacroAssembler::Inch, 15, kHRegSize, INT64_MAX); IncHelper(config, &MacroAssembler::Inch, 16, kHRegSize, -42); } TEST_SVE(sve_incw) { IncHelper(config, &MacroAssembler::Incw, 1, kWRegSize, 42); IncHelper(config, &MacroAssembler::Incw, 2, kWRegSize, -1); IncHelper(config, &MacroAssembler::Incw, 15, kWRegSize, INT64_MAX); IncHelper(config, &MacroAssembler::Incw, 16, kWRegSize, -42); } TEST_SVE(sve_incd) { IncHelper(config, &MacroAssembler::Incd, 1, kDRegSize, 42); IncHelper(config, &MacroAssembler::Incd, 2, kDRegSize, -1); IncHelper(config, &MacroAssembler::Incd, 15, kDRegSize, INT64_MAX); IncHelper(config, &MacroAssembler::Incd, 16, kDRegSize, -42); } template <typename T> static T QAdd(T x, int y) { VIXL_ASSERT(y > INT_MIN); T result; T min = std::numeric_limits<T>::min(); T max = std::numeric_limits<T>::max(); if ((x >= 0) && (y >= 0)) { // For positive x and y, saturate at max. result = (max - x) < static_cast<T>(y) ? max : x + y; } else if ((y < 0) && ((x < 0) || (min == 0))) { // For negative y, where either x is negative or T is unsigned, saturate // at min. result = (x - min) < static_cast<T>(-y) ? min : x + y; } else { result = x + y; } return result; } template <typename T> static void QIncDecHelper(Test* config, CntFn cnt, int multiplier, int lane_size_in_bits, T acc_value, bool is_increment) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); GenerateCntSequence(&masm, cnt, acc_value, multiplier); END(); if (CAN_RUN()) { RUN(); int all = core.GetSVELaneCount(lane_size_in_bits); int pow2 = 1 << HighestSetBitPosition(all); int mul4 = all - (all % 4); int mul3 = all - (all % 3); multiplier = is_increment ?
multiplier : -multiplier; ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13); ASSERT_EQUAL_64(acc_value, x14); ASSERT_EQUAL_64(acc_value, x15); ASSERT_EQUAL_64(acc_value, x18); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21); } } template static void QIncHelper(Test* config, CntFn cnt, int multiplier, int lane_size_in_bits, T acc_value) { QIncDecHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true); } template static void QDecHelper(Test* config, CntFn cnt, int multiplier, int lane_size_in_bits, T acc_value) { QIncDecHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false); } TEST_SVE(sve_sqdecb) { int64_t bigneg = INT64_MIN + 42; int64_t bigpos = INT64_MAX - 42; QDecHelper(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1); QDecHelper(config, &MacroAssembler::Sqdecb, 2, kBRegSize, bigneg); QDecHelper(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999); QDecHelper(config, &MacroAssembler::Sqdecb, 16, kBRegSize, bigpos); } TEST_SVE(sve_sqdech) { int64_t bigneg = INT64_MIN + 42; int64_t bigpos = INT64_MAX - 42; QDecHelper(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1); QDecHelper(config, &MacroAssembler::Sqdech, 2, kHRegSize, bigneg); QDecHelper(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999); QDecHelper(config, &MacroAssembler::Sqdech, 16, kHRegSize, bigpos); } TEST_SVE(sve_sqdecw) { int64_t bigneg = INT64_MIN + 42; int64_t bigpos = INT64_MAX - 42; QDecHelper(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1); QDecHelper(config, &MacroAssembler::Sqdecw, 2, kWRegSize, bigneg); QDecHelper(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999); QDecHelper(config, &MacroAssembler::Sqdecw, 16, kWRegSize, bigpos); } TEST_SVE(sve_sqdecd) { int64_t bigneg = INT64_MIN + 42; int64_t bigpos = INT64_MAX - 42; QDecHelper(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1); QDecHelper(config, &MacroAssembler::Sqdecd, 2, kDRegSize, bigneg); QDecHelper(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999); QDecHelper(config, &MacroAssembler::Sqdecd, 16, kDRegSize, bigpos); } TEST_SVE(sve_sqincb) { int64_t bigneg = INT64_MIN + 42; int64_t bigpos = INT64_MAX - 42; QIncHelper(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1); QIncHelper(config, &MacroAssembler::Sqincb, 2, kBRegSize, bigneg); QIncHelper(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999); QIncHelper(config, &MacroAssembler::Sqincb, 16, kBRegSize, bigpos); } TEST_SVE(sve_sqinch) { int64_t bigneg = INT64_MIN + 42; int64_t bigpos = INT64_MAX - 42; QIncHelper(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1); 
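// Illustrative arithmetic, not taken from the test data: with a 256-byte VL
// there are 128 H lanes, so `Sqinch(x, SVE_ALL, 16)` adds 16 * 128 = 2048.
// Starting from `bigpos` (INT64_MAX - 42) that would overflow, so the result
// saturates at INT64_MAX, which is exactly what the QAdd reference model
// computes for the expected value.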
QIncHelper(config, &MacroAssembler::Sqinch, 2, kHRegSize, bigneg); QIncHelper(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999); QIncHelper(config, &MacroAssembler::Sqinch, 16, kHRegSize, bigpos); } TEST_SVE(sve_sqincw) { int64_t bigneg = INT64_MIN + 42; int64_t bigpos = INT64_MAX - 42; QIncHelper(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1); QIncHelper(config, &MacroAssembler::Sqincw, 2, kWRegSize, bigneg); QIncHelper(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999); QIncHelper(config, &MacroAssembler::Sqincw, 16, kWRegSize, bigpos); } TEST_SVE(sve_sqincd) { int64_t bigneg = INT64_MIN + 42; int64_t bigpos = INT64_MAX - 42; QIncHelper(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1); QIncHelper(config, &MacroAssembler::Sqincd, 2, kDRegSize, bigneg); QIncHelper(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999); QIncHelper(config, &MacroAssembler::Sqincd, 16, kDRegSize, bigpos); } TEST_SVE(sve_uqdecb) { int32_t big32 = UINT32_MAX - 42; int64_t big64 = UINT64_MAX - 42; QDecHelper(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1); QDecHelper(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42); QDecHelper(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999); QDecHelper(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big32); QDecHelper(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1); QDecHelper(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42); QDecHelper(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999); QDecHelper(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big64); } TEST_SVE(sve_uqdech) { int32_t big32 = UINT32_MAX - 42; int64_t big64 = UINT64_MAX - 42; QDecHelper(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1); QDecHelper(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42); QDecHelper(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999); QDecHelper(config, &MacroAssembler::Uqdech, 16, kHRegSize, big32); QDecHelper(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1); QDecHelper(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42); QDecHelper(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999); QDecHelper(config, &MacroAssembler::Uqdech, 16, kHRegSize, big64); } TEST_SVE(sve_uqdecw) { int32_t big32 = UINT32_MAX - 42; int64_t big64 = UINT64_MAX - 42; QDecHelper(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1); QDecHelper(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42); QDecHelper(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999); QDecHelper(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big32); QDecHelper(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1); QDecHelper(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42); QDecHelper(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999); QDecHelper(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big64); } TEST_SVE(sve_uqdecd) { int32_t big32 = UINT32_MAX - 42; int64_t big64 = UINT64_MAX - 42; QDecHelper(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1); QDecHelper(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42); QDecHelper(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999); QDecHelper(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big32); QDecHelper(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1); QDecHelper(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42); QDecHelper(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999); QDecHelper(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big64); } TEST_SVE(sve_uqincb) { int32_t big32 = UINT32_MAX - 42; int64_t big64 = UINT64_MAX - 42; QIncHelper(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1); QIncHelper(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42); 
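// Illustrative arithmetic, not taken from the test data: `big32` and `big64`
// leave only 42 of headroom below UINT32_MAX and UINT64_MAX, while even the
// minimum 16-byte VL makes `Uqincb(x, SVE_ALL, 16)` add 16 * 16 = 256, so the
// later calls in this test exercise the unsigned saturating (clamping) path.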
QIncHelper(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999); QIncHelper(config, &MacroAssembler::Uqincb, 16, kBRegSize, big32); QIncHelper(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1); QIncHelper(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42); QIncHelper(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999); QIncHelper(config, &MacroAssembler::Uqincb, 16, kBRegSize, big64); } TEST_SVE(sve_uqinch) { int32_t big32 = UINT32_MAX - 42; int64_t big64 = UINT64_MAX - 42; QIncHelper(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1); QIncHelper(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42); QIncHelper(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999); QIncHelper(config, &MacroAssembler::Uqinch, 16, kHRegSize, big32); QIncHelper(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1); QIncHelper(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42); QIncHelper(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999); QIncHelper(config, &MacroAssembler::Uqinch, 16, kHRegSize, big64); } TEST_SVE(sve_uqincw) { int32_t big32 = UINT32_MAX - 42; int64_t big64 = UINT64_MAX - 42; QIncHelper(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1); QIncHelper(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42); QIncHelper(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999); QIncHelper(config, &MacroAssembler::Uqincw, 16, kWRegSize, big32); QIncHelper(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1); QIncHelper(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42); QIncHelper(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999); QIncHelper(config, &MacroAssembler::Uqincw, 16, kWRegSize, big64); } TEST_SVE(sve_uqincd) { int32_t big32 = UINT32_MAX - 42; int64_t big64 = UINT64_MAX - 42; QIncHelper(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1); QIncHelper(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42); QIncHelper(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999); QIncHelper(config, &MacroAssembler::Uqincd, 16, kDRegSize, big32); QIncHelper(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1); QIncHelper(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42); QIncHelper(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999); QIncHelper(config, &MacroAssembler::Uqincd, 16, kDRegSize, big64); } typedef void (MacroAssembler::*QIncDecXWFn)(const Register& dst, const Register& src, int pattern, int multiplier); static void QIncDecXWHelper(Test* config, QIncDecXWFn cnt, int multiplier, int lane_size_in_bits, int32_t acc_value, bool is_increment) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Initialise accumulators. 
__ Mov(x0, acc_value); __ Mov(x1, acc_value); __ Mov(x2, acc_value); __ Mov(x3, acc_value); __ Mov(x4, acc_value); __ Mov(x5, acc_value); __ Mov(x6, acc_value); __ Mov(x7, acc_value); __ Mov(x8, acc_value); __ Mov(x9, acc_value); __ Mov(x10, acc_value); __ Mov(x11, acc_value); __ Mov(x12, acc_value); __ Mov(x13, acc_value); __ Mov(x14, acc_value); __ Mov(x15, acc_value); __ Mov(x18, acc_value); __ Mov(x19, acc_value); __ Mov(x20, acc_value); __ Mov(x21, acc_value); (masm.*cnt)(x0, w0, SVE_POW2, multiplier); (masm.*cnt)(x1, w1, SVE_VL1, multiplier); (masm.*cnt)(x2, w2, SVE_VL2, multiplier); (masm.*cnt)(x3, w3, SVE_VL3, multiplier); (masm.*cnt)(x4, w4, SVE_VL4, multiplier); (masm.*cnt)(x5, w5, SVE_VL5, multiplier); (masm.*cnt)(x6, w6, SVE_VL6, multiplier); (masm.*cnt)(x7, w7, SVE_VL7, multiplier); (masm.*cnt)(x8, w8, SVE_VL8, multiplier); (masm.*cnt)(x9, w9, SVE_VL16, multiplier); (masm.*cnt)(x10, w10, SVE_VL32, multiplier); (masm.*cnt)(x11, w11, SVE_VL64, multiplier); (masm.*cnt)(x12, w12, SVE_VL128, multiplier); (masm.*cnt)(x13, w13, SVE_VL256, multiplier); (masm.*cnt)(x14, w14, 16, multiplier); (masm.*cnt)(x15, w15, 23, multiplier); (masm.*cnt)(x18, w18, 28, multiplier); (masm.*cnt)(x19, w19, SVE_MUL4, multiplier); (masm.*cnt)(x20, w20, SVE_MUL3, multiplier); (masm.*cnt)(x21, w21, SVE_ALL, multiplier); END(); if (CAN_RUN()) { RUN(); int all = core.GetSVELaneCount(lane_size_in_bits); int pow2 = 1 << HighestSetBitPosition(all); int mul4 = all - (all % 4); int mul3 = all - (all % 3); multiplier = is_increment ? multiplier : -multiplier; ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13); ASSERT_EQUAL_64(acc_value, x14); ASSERT_EQUAL_64(acc_value, x15); ASSERT_EQUAL_64(acc_value, x18); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20); ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21); } } static void QIncXWHelper(Test* config, QIncDecXWFn cnt, int multiplier, int lane_size_in_bits, int32_t acc_value) { QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true); } static void QDecXWHelper(Test* config, QIncDecXWFn cnt, int multiplier, int lane_size_in_bits, int32_t acc_value) { QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false); } TEST_SVE(sve_sqdecb_xw) { QDecXWHelper(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1); QDecXWHelper(config, &MacroAssembler::Sqdecb, 2, kBRegSize, INT32_MIN + 42); QDecXWHelper(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999); QDecXWHelper(config, &MacroAssembler::Sqdecb, 16, kBRegSize, INT32_MAX - 42); } TEST_SVE(sve_sqdech_xw) { QDecXWHelper(config, 
&MacroAssembler::Sqdech, 1, kHRegSize, 1); QDecXWHelper(config, &MacroAssembler::Sqdech, 2, kHRegSize, INT32_MIN + 42); QDecXWHelper(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999); QDecXWHelper(config, &MacroAssembler::Sqdech, 16, kHRegSize, INT32_MAX - 42); } TEST_SVE(sve_sqdecw_xw) { QDecXWHelper(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1); QDecXWHelper(config, &MacroAssembler::Sqdecw, 2, kWRegSize, INT32_MIN + 42); QDecXWHelper(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999); QDecXWHelper(config, &MacroAssembler::Sqdecw, 16, kWRegSize, INT32_MAX - 42); } TEST_SVE(sve_sqdecd_xw) { QDecXWHelper(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1); QDecXWHelper(config, &MacroAssembler::Sqdecd, 2, kDRegSize, INT32_MIN + 42); QDecXWHelper(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999); QDecXWHelper(config, &MacroAssembler::Sqdecd, 16, kDRegSize, INT32_MAX - 42); } TEST_SVE(sve_sqincb_xw) { QIncXWHelper(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1); QIncXWHelper(config, &MacroAssembler::Sqincb, 2, kBRegSize, INT32_MIN + 42); QIncXWHelper(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999); QIncXWHelper(config, &MacroAssembler::Sqincb, 16, kBRegSize, INT32_MAX - 42); } TEST_SVE(sve_sqinch_xw) { QIncXWHelper(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1); QIncXWHelper(config, &MacroAssembler::Sqinch, 2, kHRegSize, INT32_MIN + 42); QIncXWHelper(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999); QIncXWHelper(config, &MacroAssembler::Sqinch, 16, kHRegSize, INT32_MAX - 42); } TEST_SVE(sve_sqincw_xw) { QIncXWHelper(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1); QIncXWHelper(config, &MacroAssembler::Sqincw, 2, kWRegSize, INT32_MIN + 42); QIncXWHelper(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999); QIncXWHelper(config, &MacroAssembler::Sqincw, 16, kWRegSize, INT32_MAX - 42); } TEST_SVE(sve_sqincd_xw) { QIncXWHelper(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1); QIncXWHelper(config, &MacroAssembler::Sqincd, 2, kDRegSize, INT32_MIN + 42); QIncXWHelper(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999); QIncXWHelper(config, &MacroAssembler::Sqincd, 16, kDRegSize, INT32_MAX - 42); } typedef void (MacroAssembler::*IncDecZFn)(const ZRegister& dst, int pattern, int multiplier); typedef void (MacroAssembler::*AddSubFn)(const ZRegister& dst, const ZRegister& src1, const ZRegister& src2); static void IncDecZHelper(Test* config, IncDecZFn fn, CntFn cnt, AddSubFn addsub, int multiplier, int lane_size_in_bits) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t acc_inputs[] = {0x7766554433221100, 0xffffffffffffffff, 0x0000000000000000, 0xffffffff0000ffff, 0x7fffffffffffffff, 0x8000000000000000, 0x7fffffff7fff7fff, 0x8000000080008000}; for (unsigned i = 0; i < kNumberOfZRegisters; i++) { for (int j = 0; j < 4; j++) { InsrHelper(&masm, ZRegister(i, kDRegSize), acc_inputs); } } for (unsigned i = 0; i < 15; i++) { __ Mov(XRegister(i), 0); } (masm.*fn)(z16.WithLaneSize(lane_size_in_bits), SVE_POW2, multiplier); (masm.*fn)(z17.WithLaneSize(lane_size_in_bits), SVE_VL1, multiplier); (masm.*fn)(z18.WithLaneSize(lane_size_in_bits), SVE_VL2, multiplier); (masm.*fn)(z19.WithLaneSize(lane_size_in_bits), SVE_VL3, multiplier); (masm.*fn)(z20.WithLaneSize(lane_size_in_bits), SVE_VL4, multiplier); (masm.*fn)(z21.WithLaneSize(lane_size_in_bits), SVE_VL7, multiplier); (masm.*fn)(z22.WithLaneSize(lane_size_in_bits), SVE_VL8, multiplier); (masm.*fn)(z23.WithLaneSize(lane_size_in_bits), SVE_VL16, multiplier); (masm.*fn)(z24.WithLaneSize(lane_size_in_bits), SVE_VL64, 
multiplier); (masm.*fn)(z25.WithLaneSize(lane_size_in_bits), SVE_VL256, multiplier); (masm.*fn)(z26.WithLaneSize(lane_size_in_bits), 16, multiplier); (masm.*fn)(z27.WithLaneSize(lane_size_in_bits), 28, multiplier); (masm.*fn)(z28.WithLaneSize(lane_size_in_bits), SVE_MUL3, multiplier); (masm.*fn)(z29.WithLaneSize(lane_size_in_bits), SVE_MUL4, multiplier); (masm.*fn)(z30.WithLaneSize(lane_size_in_bits), SVE_ALL, multiplier); // Perform computation using alternative instructions. (masm.*cnt)(x0, SVE_POW2, multiplier); (masm.*cnt)(x1, SVE_VL1, multiplier); (masm.*cnt)(x2, SVE_VL2, multiplier); (masm.*cnt)(x3, SVE_VL3, multiplier); (masm.*cnt)(x4, SVE_VL4, multiplier); (masm.*cnt)(x5, SVE_VL7, multiplier); (masm.*cnt)(x6, SVE_VL8, multiplier); (masm.*cnt)(x7, SVE_VL16, multiplier); (masm.*cnt)(x8, SVE_VL64, multiplier); (masm.*cnt)(x9, SVE_VL256, multiplier); (masm.*cnt)(x10, 16, multiplier); (masm.*cnt)(x11, 28, multiplier); (masm.*cnt)(x12, SVE_MUL3, multiplier); (masm.*cnt)(x13, SVE_MUL4, multiplier); (masm.*cnt)(x14, SVE_ALL, multiplier); ZRegister zscratch = z15.WithLaneSize(lane_size_in_bits); for (unsigned i = 0; i < 15; i++) { ZRegister zsrcdst = ZRegister(i, lane_size_in_bits); Register x = Register(i, kXRegSize); __ Dup(zscratch, x); (masm.*addsub)(zsrcdst, zsrcdst, zscratch); } END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z0, z16); ASSERT_EQUAL_SVE(z1, z17); ASSERT_EQUAL_SVE(z2, z18); ASSERT_EQUAL_SVE(z3, z19); ASSERT_EQUAL_SVE(z4, z20); ASSERT_EQUAL_SVE(z5, z21); ASSERT_EQUAL_SVE(z6, z22); ASSERT_EQUAL_SVE(z7, z23); ASSERT_EQUAL_SVE(z8, z24); ASSERT_EQUAL_SVE(z9, z25); ASSERT_EQUAL_SVE(z10, z26); ASSERT_EQUAL_SVE(z11, z27); ASSERT_EQUAL_SVE(z12, z28); ASSERT_EQUAL_SVE(z13, z29); ASSERT_EQUAL_SVE(z14, z30); } } TEST_SVE(sve_inc_dec_vec) { CntFn cnth = &MacroAssembler::Cnth; CntFn cntw = &MacroAssembler::Cntw; CntFn cntd = &MacroAssembler::Cntd; AddSubFn sub = &MacroAssembler::Sub; AddSubFn add = &MacroAssembler::Add; for (int mult = 1; mult <= 16; mult += 5) { IncDecZHelper(config, &MacroAssembler::Dech, cnth, sub, mult, kHRegSize); IncDecZHelper(config, &MacroAssembler::Decw, cntw, sub, mult, kSRegSize); IncDecZHelper(config, &MacroAssembler::Decd, cntd, sub, mult, kDRegSize); IncDecZHelper(config, &MacroAssembler::Inch, cnth, add, mult, kHRegSize); IncDecZHelper(config, &MacroAssembler::Incw, cntw, add, mult, kSRegSize); IncDecZHelper(config, &MacroAssembler::Incd, cntd, add, mult, kDRegSize); } } TEST_SVE(sve_unsigned_sat_inc_dec_vec) { CntFn cnth = &MacroAssembler::Cnth; CntFn cntw = &MacroAssembler::Cntw; CntFn cntd = &MacroAssembler::Cntd; AddSubFn sub = &MacroAssembler::Uqsub; AddSubFn add = &MacroAssembler::Uqadd; for (int mult = 1; mult <= 16; mult += 5) { IncDecZHelper(config, &MacroAssembler::Uqdech, cnth, sub, mult, kHRegSize); IncDecZHelper(config, &MacroAssembler::Uqdecw, cntw, sub, mult, kSRegSize); IncDecZHelper(config, &MacroAssembler::Uqdecd, cntd, sub, mult, kDRegSize); IncDecZHelper(config, &MacroAssembler::Uqinch, cnth, add, mult, kHRegSize); IncDecZHelper(config, &MacroAssembler::Uqincw, cntw, add, mult, kSRegSize); IncDecZHelper(config, &MacroAssembler::Uqincd, cntd, add, mult, kDRegSize); } } TEST_SVE(sve_signed_sat_inc_dec_vec) { CntFn cnth = &MacroAssembler::Cnth; CntFn cntw = &MacroAssembler::Cntw; CntFn cntd = &MacroAssembler::Cntd; AddSubFn sub = &MacroAssembler::Sqsub; AddSubFn add = &MacroAssembler::Sqadd; for (int mult = 1; mult <= 16; mult += 5) { IncDecZHelper(config, &MacroAssembler::Sqdech, cnth, sub, mult, kHRegSize); IncDecZHelper(config, 
&MacroAssembler::Sqdecw, cntw, sub, mult, kSRegSize); IncDecZHelper(config, &MacroAssembler::Sqdecd, cntd, sub, mult, kDRegSize); IncDecZHelper(config, &MacroAssembler::Sqinch, cnth, add, mult, kHRegSize); IncDecZHelper(config, &MacroAssembler::Sqincw, cntw, add, mult, kSRegSize); IncDecZHelper(config, &MacroAssembler::Sqincd, cntd, add, mult, kDRegSize); } } typedef void (MacroAssembler::*ArithPredicatedFn)(const ZRegister& zd, const PRegisterM& pg, const ZRegister& zn, const ZRegister& zm); template <typename Td, typename Tg, typename Tn> static void IntBinArithHelper(Test* config, ArithPredicatedFn macro, unsigned lane_size_in_bits, const Tg& pg_inputs, const Tn& zn_inputs, const Tn& zm_inputs, const Td& zd_expected) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); ZRegister src_a = z30.WithLaneSize(lane_size_in_bits); ZRegister src_b = z27.WithLaneSize(lane_size_in_bits); InsrHelper(&masm, src_a, zn_inputs); InsrHelper(&masm, src_b, zm_inputs); Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs); ZRegister zd_1 = z0.WithLaneSize(lane_size_in_bits); ZRegister zd_2 = z1.WithLaneSize(lane_size_in_bits); ZRegister zd_3 = z2.WithLaneSize(lane_size_in_bits); // `instr` zd(dst), zd(src_a), zn(src_b) __ Mov(zd_1, src_a); (masm.*macro)(zd_1, p0.Merging(), zd_1, src_b); // `instr` zd(dst), zm(src_a), zd(src_b) // When the zd and zm registers are aliased, the macro (`Instr`) swaps the // operand order if the operation is commutative; otherwise it falls back to // the reversed form of `Instr`, such as subr or divr. __ Mov(zd_2, src_b); (masm.*macro)(zd_2, p0.Merging(), src_a, zd_2); // `instr` zd(dst), zm(src_a), zn(src_b) // The macro (`Instr`) automatically selects between `instr` alone and // movprfx + `instr`, based on whether the zd and zn registers are aliased. // A generated movprfx instruction is predicated, using the same governing // predicate register. To keep the result predictable, initialise the // destination register first.
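// As an illustration (a hypothetical expansion; the exact sequence is chosen
// by the MacroAssembler), with `Add` as the macro, a call such as
//   Add(z2.VnB(), p0.Merging(), z30.VnB(), z27.VnB())
// where zd and zn are distinct may be emitted as
//   movprfx z2.b, p0/m, z30.b
//   add z2.b, p0/m, z2.b, z27.b
// so inactive lanes of the destination keep whatever it held beforehand.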
__ Mov(zd_3, src_a); (masm.*macro)(zd_3, p0.Merging(), src_a, src_b); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zd_expected, zd_1); for (size_t i = 0; i < ArrayLength(zd_expected); i++) { int lane = static_cast(ArrayLength(zd_expected) - i - 1); if (!core.HasSVELane(zd_1, lane)) break; if ((pg_inputs[i] & 1) != 0) { ASSERT_EQUAL_SVE_LANE(zd_expected[i], zd_1, lane); } else { ASSERT_EQUAL_SVE_LANE(zn_inputs[i], zd_1, lane); } } ASSERT_EQUAL_SVE(zd_expected, zd_3); } } TEST_SVE(sve_binary_arithmetic_predicated_add) { // clang-format off unsigned zn_b[] = {0x00, 0x01, 0x10, 0x81, 0xff, 0x0f, 0x01, 0x7f}; unsigned zm_b[] = {0x00, 0x01, 0x10, 0x00, 0x81, 0x80, 0xff, 0xff}; unsigned zn_h[] = {0x0000, 0x0123, 0x1010, 0x8181, 0xffff, 0x0f0f, 0x0101, 0x7f7f}; unsigned zm_h[] = {0x0000, 0x0123, 0x1010, 0x0000, 0x8181, 0x8080, 0xffff, 0xffff}; unsigned zn_s[] = {0x00000000, 0x01234567, 0x10101010, 0x81818181, 0xffffffff, 0x0f0f0f0f, 0x01010101, 0x7f7f7f7f}; unsigned zm_s[] = {0x00000000, 0x01234567, 0x10101010, 0x00000000, 0x81818181, 0x80808080, 0xffffffff, 0xffffffff}; uint64_t zn_d[] = {0x0000000000000000, 0x0123456789abcdef, 0x1010101010101010, 0x8181818181818181, 0xffffffffffffffff, 0x0f0f0f0f0f0f0f0f, 0x0101010101010101, 0x7f7f7f7fffffffff}; uint64_t zm_d[] = {0x0000000000000000, 0x0123456789abcdef, 0x1010101010101010, 0x0000000000000000, 0x8181818181818181, 0x8080808080808080, 0xffffffffffffffff, 0xffffffffffffffff}; int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0}; int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1}; int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1}; int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1}; unsigned add_exp_b[] = {0x00, 0x02, 0x20, 0x81, 0x80, 0x8f, 0x00, 0x7f}; unsigned add_exp_h[] = {0x0000, 0x0246, 0x1010, 0x8181, 0x8180, 0x8f8f, 0x0101, 0x7f7e}; unsigned add_exp_s[] = {0x00000000, 0x01234567, 0x20202020, 0x81818181, 0x81818180, 0x0f0f0f0f, 0x01010100, 0x7f7f7f7e}; uint64_t add_exp_d[] = {0x0000000000000000, 0x02468acf13579bde, 0x2020202020202020, 0x8181818181818181, 0xffffffffffffffff, 0x8f8f8f8f8f8f8f8f, 0x0101010101010100, 0x7f7f7f7ffffffffe}; ArithPredicatedFn fn = &MacroAssembler::Add; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, add_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, add_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, add_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, add_exp_d); unsigned sub_exp_b[] = {0x00, 0x00, 0x00, 0x81, 0x7e, 0x8f, 0x02, 0x7f}; unsigned sub_exp_h[] = {0x0000, 0x0000, 0x1010, 0x8181, 0x7e7e, 0x8e8f, 0x0101, 0x7f80}; unsigned sub_exp_s[] = {0x00000000, 0x01234567, 0x00000000, 0x81818181, 0x7e7e7e7e, 0x0f0f0f0f, 0x01010102, 0x7f7f7f80}; uint64_t sub_exp_d[] = {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x8181818181818181, 0xffffffffffffffff, 0x8e8e8e8e8e8e8e8f, 0x0101010101010102, 0x7f7f7f8000000000}; fn = &MacroAssembler::Sub; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sub_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sub_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sub_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sub_exp_d); // clang-format on } TEST_SVE(sve_binary_arithmetic_predicated_umin_umax_uabd) { // clang-format off unsigned zn_b[] = {0x00, 0xff, 0x0f, 0xff, 0xf0, 0x98, 0x55, 0x67}; unsigned zm_b[] = {0x01, 0x00, 0x0e, 0xfe, 0xfe, 0xab, 0xcd, 0x78}; unsigned zn_h[] = {0x0000, 0xffff, 0x00ff, 0xffff, 0xff00, 0xba98, 0x5555, 0x4567}; unsigned zm_h[] = {0x0001, 0x0000, 0x00ee, 0xfffe, 0xfe00, 
0xabab, 0xcdcd, 0x5678}; unsigned zn_s[] = {0x00000000, 0xffffffff, 0x0000ffff, 0xffffffff, 0xffff0000, 0xfedcba98, 0x55555555, 0x01234567}; unsigned zm_s[] = {0x00000001, 0x00000000, 0x0000eeee, 0xfffffffe, 0xfffe0000, 0xabababab, 0xcdcdcdcd, 0x12345678}; uint64_t zn_d[] = {0x0000000000000000, 0xffffffffffffffff, 0x5555555555555555, 0x0000000001234567}; uint64_t zm_d[] = {0x0000000000000001, 0x0000000000000000, 0xcdcdcdcdcdcdcdcd, 0x0000000012345678}; int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0}; int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1}; int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1}; int pg_d[] = {1, 0, 1, 1}; unsigned umax_exp_b[] = {0x01, 0xff, 0x0f, 0xff, 0xfe, 0xab, 0xcd, 0x67}; unsigned umax_exp_h[] = {0x0001, 0xffff, 0x00ff, 0xffff, 0xff00, 0xba98, 0x5555, 0x5678}; unsigned umax_exp_s[] = {0x00000001, 0xffffffff, 0x0000ffff, 0xffffffff, 0xffff0000, 0xfedcba98, 0xcdcdcdcd, 0x12345678}; uint64_t umax_exp_d[] = {0x0000000000000001, 0xffffffffffffffff, 0xcdcdcdcdcdcdcdcd, 0x0000000012345678}; ArithPredicatedFn fn = &MacroAssembler::Umax; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umax_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umax_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umax_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umax_exp_d); unsigned umin_exp_b[] = {0x00, 0x00, 0x0e, 0xff, 0xf0, 0x98, 0x55, 0x67}; unsigned umin_exp_h[] = {0x0000, 0x0000, 0x00ff, 0xfffe, 0xfe00, 0xabab, 0x5555, 0x4567}; unsigned umin_exp_s[] = {0x00000000, 0xffffffff, 0x0000eeee, 0xfffffffe, 0xfffe0000, 0xfedcba98, 0x55555555, 0x01234567}; uint64_t umin_exp_d[] = {0x0000000000000000, 0xffffffffffffffff, 0x5555555555555555, 0x0000000001234567}; fn = &MacroAssembler::Umin; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umin_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umin_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umin_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umin_exp_d); unsigned uabd_exp_b[] = {0x01, 0xff, 0x01, 0xff, 0x0e, 0x13, 0x78, 0x67}; unsigned uabd_exp_h[] = {0x0001, 0xffff, 0x00ff, 0x0001, 0x0100, 0x0eed, 0x5555, 0x1111}; unsigned uabd_exp_s[] = {0x00000001, 0xffffffff, 0x00001111, 0x00000001, 0x00010000, 0xfedcba98, 0x78787878, 0x11111111}; uint64_t uabd_exp_d[] = {0x0000000000000001, 0xffffffffffffffff, 0x7878787878787878, 0x0000000011111111}; fn = &MacroAssembler::Uabd; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, uabd_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, uabd_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, uabd_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, uabd_exp_d); // clang-format on } TEST_SVE(sve_binary_arithmetic_predicated_smin_smax_sabd) { // clang-format off int zn_b[] = {0, -128, -128, -128, -128, 127, 127, 1}; int zm_b[] = {-1, 0, -1, -127, 127, 126, -1, 0}; int zn_h[] = {0, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MAX, INT16_MAX, 1}; int zm_h[] = {-1, 0, -1, INT16_MIN + 1, INT16_MAX, INT16_MAX - 1, -1, 0}; int zn_s[] = {0, INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN, INT32_MAX, INT32_MAX, 1}; int zm_s[] = {-1, 0, -1, -INT32_MAX, INT32_MAX, INT32_MAX - 1, -1, 0}; int64_t zn_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN, INT64_MAX, INT64_MAX, 1}; int64_t zm_d[] = {-1, 0, -1, INT64_MIN + 1, INT64_MAX, INT64_MAX - 1, -1, 0}; int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0}; int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1}; int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1}; 
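// The helper checks merging predication, so in the expected arrays below,
// lanes whose governing predicate bit is 0 simply repeat the corresponding
// zn (first source) input.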
int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1}; int smax_exp_b[] = {0, 0, -1, -128, 127, 127, 127, 1}; int smax_exp_h[] = {0, 0, INT16_MIN, INT16_MIN + 1, INT16_MAX, INT16_MAX, INT16_MAX, 1}; int smax_exp_s[] = {0, INT32_MIN, -1, INT32_MIN + 1, INT32_MAX, INT32_MAX, INT32_MAX, 1}; int64_t smax_exp_d[] = {0, 0, -1, INT64_MIN + 1, INT64_MIN, INT64_MAX, INT64_MAX, 1}; ArithPredicatedFn fn = &MacroAssembler::Smax; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smax_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smax_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smax_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smax_exp_d); int smin_exp_b[] = {-1, -128, -128, -128, -128, 126, -1, 1}; int smin_exp_h[] = {-1, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MAX - 1, INT16_MAX, 0}; int smin_exp_s[] = {-1, INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN, INT32_MAX, -1, 0}; int64_t smin_exp_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN, INT64_MAX - 1, -1, 0}; fn = &MacroAssembler::Smin; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smin_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smin_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smin_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smin_exp_d); unsigned sabd_exp_b[] = {1, 128, 127, 128, 255, 1, 128, 1}; unsigned sabd_exp_h[] = {1, 0x8000, 0x8000, 1, 0xffff, 1, 0x7fff, 1}; unsigned sabd_exp_s[] = {1, 0x80000000, 0x7fffffff, 1, 0xffffffff, 0x7fffffff, 0x80000000, 1}; uint64_t sabd_exp_d[] = {0, 0x8000000000000000, 0x7fffffffffffffff, 1, 0x8000000000000000, 1, 0x8000000000000000, 1}; fn = &MacroAssembler::Sabd; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sabd_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sabd_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sabd_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sabd_exp_d); // clang-format on } TEST_SVE(sve_binary_arithmetic_predicated_mul_umulh) { // clang-format off unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa}; unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08}; unsigned zn_h[] = {0x0000, 0x0001, 0x0020, 0x0800, 0x8000, 0xff00, 0x5555, 0xaaaa}; unsigned zm_h[] = {0x007f, 0x00cd, 0x0800, 0xffff, 0x5555, 0xaaaa, 0x0001, 0x1234}; unsigned zn_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800, 0x12345678, 0xffffffff, 0x55555555, 0xaaaaaaaa}; unsigned zm_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800, 0x12345678, 0x22223333, 0x55556666, 0x77778888}; uint64_t zn_d[] = {0x0000000000000000, 0x5555555555555555, 0xffffffffffffffff, 0xaaaaaaaaaaaaaaaa}; uint64_t zm_d[] = {0x0000000000000000, 0x1111111133333333, 0xddddddddeeeeeeee, 0xaaaaaaaaaaaaaaaa}; int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1}; int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1}; int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1}; int pg_d[] = {1, 1, 0, 1}; unsigned mul_exp_b[] = {0x00, 0xcd, 0x00, 0xf8, 0x80, 0x56, 0x00, 0x50}; unsigned mul_exp_h[] = {0x0000, 0x0001, 0x0000, 0xf800, 0x8000, 0xff00, 0x5555, 0x9e88}; unsigned mul_exp_s[] = {0x00000000, 0x00000001, 0x00200020, 0x00400000, 0x1df4d840, 0xddddcccd, 0x55555555, 0xb05afa50}; uint64_t mul_exp_d[] = {0x0000000000000000, 0xa4fa4fa4eeeeeeef, 0xffffffffffffffff, 0x38e38e38e38e38e4}; ArithPredicatedFn fn = &MacroAssembler::Mul; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, mul_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, 
mul_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, mul_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, mul_exp_d); unsigned umulh_exp_b[] = {0x00, 0x00, 0x10, 0x07, 0x80, 0xa9, 0x00, 0x05}; unsigned umulh_exp_h[] = {0x0000, 0x0001, 0x0001, 0x07ff, 0x2aaa, 0xff00, 0x0000, 0x0c22}; unsigned umulh_exp_s[] = {0x00000000, 0x00000000, 0x00200020, 0x00400080, 0x014b66dc, 0x22223332, 0x55555555, 0x4fa505af}; uint64_t umulh_exp_d[] = {0x0000000000000000, 0x05b05b05bbbbbbbb, 0xffffffffffffffff, 0x71c71c71c71c71c6}; fn = &MacroAssembler::Umulh; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umulh_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umulh_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umulh_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umulh_exp_d); // clang-format on } TEST_SVE(sve_binary_arithmetic_predicated_smulh) { // clang-format off int zn_b[] = {0, 1, -1, INT8_MIN, INT8_MAX, -1, 100, -3}; int zm_b[] = {0, INT8_MIN, INT8_MIN, INT8_MAX, INT8_MAX, -1, 2, 66}; int zn_h[] = {0, 1, -1, INT16_MIN, INT16_MAX, -1, 10000, -3}; int zm_h[] = {0, INT16_MIN, INT16_MIN, INT16_MAX, INT16_MAX, -1, 2, 6666}; int zn_s[] = {0, 1, -1, INT32_MIN, INT32_MAX, -1, 100000000, -3}; int zm_s[] = {0, INT32_MIN, INT32_MIN, INT32_MAX, INT32_MAX, -1, 2, 66666666}; int64_t zn_d[] = {0, -1, INT64_MIN, INT64_MAX}; int64_t zm_d[] = {INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX}; int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1}; int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1}; int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1}; int pg_d[] = {1, 1, 0, 1}; int exp_b[] = {0, -1, 0, -64, INT8_MAX, 0, 0, -1}; int exp_h[] = {0, 1, 0, -16384, 16383, -1, 0, -1}; int exp_s[] = {0, -1, -1, -1073741824, 1073741823, 0, 100000000, -1}; int64_t exp_d[] = {0, -1, INT64_MIN, 4611686018427387903}; ArithPredicatedFn fn = &MacroAssembler::Smulh; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d); // clang-format on } TEST_SVE(sve_binary_arithmetic_predicated_logical) { // clang-format off unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa}; unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08}; unsigned zn_h[] = {0x0000, 0x0001, 0x2020, 0x0008, 0x8000, 0xffff, 0x5555, 0xaaaa}; unsigned zm_h[] = {0x7fff, 0xabcd, 0x8000, 0xffff, 0x5555, 0xaaaa, 0x0000, 0x0800}; unsigned zn_s[] = {0x00000001, 0x20200008, 0x8000ffff, 0x5555aaaa}; unsigned zm_s[] = {0x7fffabcd, 0x8000ffff, 0x5555aaaa, 0x00000800}; uint64_t zn_d[] = {0xfedcba9876543210, 0x0123456789abcdef, 0x0001200880ff55aa, 0x0022446688aaccee}; uint64_t zm_d[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff, 0x7fcd80ff55aa0008, 0x1133557799bbddff}; int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1}; int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1}; int pg_s[] = {1, 1, 1, 0}; int pg_d[] = {1, 1, 0, 1}; unsigned and_exp_b[] = {0x00, 0x01, 0x00, 0x08, 0x80, 0xaa, 0x00, 0x08}; unsigned and_exp_h[] = {0x0000, 0x0001, 0x0000, 0x0008, 0x0000, 0xffff, 0x0000, 0x0800}; unsigned and_exp_s[] = {0x00000001, 0x00000008, 0x0000aaaa, 0x5555aaaa}; uint64_t and_exp_d[] = {0xfedcaa8854540000, 0x0000454588aacdef, 0x0001200880ff55aa, 0x0022446688aaccee}; ArithPredicatedFn fn = &MacroAssembler::And; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, and_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, 
and_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, and_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, and_exp_d); unsigned bic_exp_b[] = {0x00, 0x00, 0x20, 0x00, 0x80, 0x55, 0x55, 0xa2}; unsigned bic_exp_h[] = {0x0000, 0x0001, 0x2020, 0x0000, 0x8000, 0xffff, 0x5555, 0xa2aa}; unsigned bic_exp_s[] = {0x00000000, 0x20200000, 0x80005555, 0x5555aaaa}; uint64_t bic_exp_d[] = {0x0000101022003210, 0x0123002201010000, 0x0001200880ff55aa, 0x0000000000000000}; fn = &MacroAssembler::Bic; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, bic_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, bic_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, bic_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, bic_exp_d); unsigned eor_exp_b[] = {0x00, 0xcc, 0xa0, 0xf7, 0x80, 0x55, 0x55, 0xa2}; unsigned eor_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xfff7, 0xd555, 0xffff, 0x5555, 0xa2aa}; unsigned eor_exp_s[] = {0x7fffabcc, 0xa020fff7, 0xd5555555, 0x5555aaaa}; uint64_t eor_exp_d[] = {0x01235476ab89fedc, 0xcdef98ba67453210, 0x0001200880ff55aa, 0x1111111111111111}; fn = &MacroAssembler::Eor; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, eor_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, eor_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, eor_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, eor_exp_d); unsigned orr_exp_b[] = {0x00, 0xcd, 0xa0, 0xff, 0x80, 0xff, 0x55, 0xaa}; unsigned orr_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xffff, 0xd555, 0xffff, 0x5555, 0xaaaa}; unsigned orr_exp_s[] = {0x7fffabcd, 0xa020ffff, 0xd555ffff, 0x5555aaaa}; uint64_t orr_exp_d[] = {0xfffffefeffddfedc, 0xcdefddffefefffff, 0x0001200880ff55aa, 0x1133557799bbddff}; fn = &MacroAssembler::Orr; IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, orr_exp_b); IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, orr_exp_h); IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, orr_exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, orr_exp_d); // clang-format on } TEST_SVE(sve_binary_arithmetic_predicated_sdiv) { // clang-format off int zn_s[] = {0, 1, -1, 2468, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, -11111111, 87654321, 0, 0}; int zm_s[] = {1, -1, 1, 1234, -1, INT32_MIN, 1, -1, 22222222, 80000000, -1, 0}; int64_t zn_d[] = {0, 1, -1, 2468, INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX, -11111111, 87654321, 0, 0}; int64_t zm_d[] = {1, -1, 1, 1234, -1, INT64_MIN, 1, -1, 22222222, 80000000, -1, 0}; int pg_s[] = {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}; int pg_d[] = {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1}; int exp_s[] = {0, 1, -1, 2, INT32_MIN, 0, INT32_MIN, -INT32_MAX, 0, 1, 0, 0}; int64_t exp_d[] = {0, -1, -1, 2, INT64_MIN, INT64_MAX, INT64_MIN, -INT64_MAX, 0, 1, 0, 0}; ArithPredicatedFn fn = &MacroAssembler::Sdiv; IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d); // clang-format on } TEST_SVE(sve_binary_arithmetic_predicated_udiv) { // clang-format off unsigned zn_s[] = {0x00000000, 0x00000001, 0xffffffff, 0x80000000, 0xffffffff, 0x80000000, 0xffffffff, 0x0000f000}; unsigned zm_s[] = {0x00000001, 0xffffffff, 0x80000000, 0x00000002, 0x00000000, 0x00000001, 0x00008000, 0xf0000000}; uint64_t zn_d[] = {0x0000000000000000, 0x0000000000000001, 0xffffffffffffffff, 0x8000000000000000, 0xffffffffffffffff, 0x8000000000000000, 0xffffffffffffffff, 0xf0000000f0000000}; uint64_t zm_d[] = 
{0x0000000000000001, 0xffffffff00000000, 0x8000000000000000, 0x0000000000000002, 0x8888888888888888, 0x0000000000000001, 0x0000000080000000, 0x00000000f0000000}; int pg_s[] = {1, 1, 0, 1, 1, 0, 1, 1}; int pg_d[] = {1, 0, 1, 1, 1, 1, 0, 1}; unsigned exp_s[] = {0x00000000, 0x00000000, 0xffffffff, 0x40000000, 0x00000000, 0x80000000, 0x0001ffff, 0x00000000}; uint64_t exp_d[] = {0x0000000000000000, 0x0000000000000001, 0x0000000000000001, 0x4000000000000000, 0x0000000000000001, 0x8000000000000000, 0xffffffffffffffff, 0x0000000100000001}; ArithPredicatedFn fn = &MacroAssembler::Udiv; IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s); IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d); // clang-format on } typedef void (MacroAssembler::*ArithFn)(const ZRegister& zd, const ZRegister& zn, const ZRegister& zm); template static void IntArithHelper(Test* config, ArithFn macro, unsigned lane_size_in_bits, const T& zn_inputs, const T& zm_inputs, const T& zd_expected) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); ZRegister zn = z31.WithLaneSize(lane_size_in_bits); ZRegister zm = z27.WithLaneSize(lane_size_in_bits); InsrHelper(&masm, zn, zn_inputs); InsrHelper(&masm, zm, zm_inputs); ZRegister zd = z0.WithLaneSize(lane_size_in_bits); (masm.*macro)(zd, zn, zm); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zd_expected, zd); } } TEST_SVE(sve_arithmetic_unpredicated_add_sqadd_uqadd) { // clang-format off unsigned in_b[] = {0x81, 0x7f, 0x10, 0xaa, 0x55, 0xff, 0xf0}; unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa, 0x5555, 0xffff, 0xf0f0}; unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0x10001010, 0xaaaaaaaa, 0xf000f0f0}; uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f, 0x1000000010001010, 0xf0000000f000f0f0}; ArithFn fn = &MacroAssembler::Add; unsigned add_exp_b[] = {0x02, 0xfe, 0x20, 0x54, 0xaa, 0xfe, 0xe0}; unsigned add_exp_h[] = {0x0302, 0xfefe, 0x2020, 0x5554, 0xaaaa, 0xfffe, 0xe1e0}; unsigned add_exp_s[] = {0x00030302, 0xfffefefe, 0x20002020, 0x55555554, 0xe001e1e0}; uint64_t add_exp_d[] = {0x0000000300030302, 0xfffffffefffefefe, 0x2000000020002020, 0xe0000001e001e1e0}; IntArithHelper(config, fn, kBRegSize, in_b, in_b, add_exp_b); IntArithHelper(config, fn, kHRegSize, in_h, in_h, add_exp_h); IntArithHelper(config, fn, kSRegSize, in_s, in_s, add_exp_s); IntArithHelper(config, fn, kDRegSize, in_d, in_d, add_exp_d); fn = &MacroAssembler::Sqadd; unsigned sqadd_exp_b[] = {0x80, 0x7f, 0x20, 0x80, 0x7f, 0xfe, 0xe0}; unsigned sqadd_exp_h[] = {0x8000, 0x7fff, 0x2020, 0x8000, 0x7fff, 0xfffe, 0xe1e0}; unsigned sqadd_exp_s[] = {0x80000000, 0x7fffffff, 0x20002020, 0x80000000, 0xe001e1e0}; uint64_t sqadd_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff, 0x2000000020002020, 0xe0000001e001e1e0}; IntArithHelper(config, fn, kBRegSize, in_b, in_b, sqadd_exp_b); IntArithHelper(config, fn, kHRegSize, in_h, in_h, sqadd_exp_h); IntArithHelper(config, fn, kSRegSize, in_s, in_s, sqadd_exp_s); IntArithHelper(config, fn, kDRegSize, in_d, in_d, sqadd_exp_d); fn = &MacroAssembler::Uqadd; unsigned uqadd_exp_b[] = {0xff, 0xfe, 0x20, 0xff, 0xaa, 0xff, 0xff}; unsigned uqadd_exp_h[] = {0xffff, 0xfefe, 0x2020, 0xffff, 0xaaaa, 0xffff, 0xffff}; unsigned uqadd_exp_s[] = {0xffffffff, 0xfffefefe, 0x20002020, 0xffffffff, 0xffffffff}; uint64_t uqadd_exp_d[] = {0xffffffffffffffff, 0xfffffffefffefefe, 0x2000000020002020, 0xffffffffffffffff}; IntArithHelper(config, fn, kBRegSize, in_b, in_b, uqadd_exp_b); IntArithHelper(config, fn, kHRegSize, in_h, in_h, uqadd_exp_h); IntArithHelper(config, fn, 
kSRegSize, in_s, in_s, uqadd_exp_s); IntArithHelper(config, fn, kDRegSize, in_d, in_d, uqadd_exp_d); // clang-format on } TEST_SVE(sve_arithmetic_unpredicated_sub_sqsub_uqsub) { // clang-format off unsigned ins1_b[] = {0x81, 0x7f, 0x7e, 0xaa}; unsigned ins2_b[] = {0x10, 0xf0, 0xf0, 0x55}; unsigned ins1_h[] = {0x8181, 0x7f7f, 0x7e7e, 0xaaaa}; unsigned ins2_h[] = {0x1010, 0xf0f0, 0xf0f0, 0x5555}; unsigned ins1_s[] = {0x80018181, 0x7fff7f7f, 0x7eee7e7e, 0xaaaaaaaa}; unsigned ins2_s[] = {0x10001010, 0xf000f0f0, 0xf000f0f0, 0x55555555}; uint64_t ins1_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f, 0x7eeeeeee7eee7e7e, 0xaaaaaaaaaaaaaaaa}; uint64_t ins2_d[] = {0x1000000010001010, 0xf0000000f000f0f0, 0xf0000000f000f0f0, 0x5555555555555555}; ArithFn fn = &MacroAssembler::Sub; unsigned ins1_sub_ins2_exp_b[] = {0x71, 0x8f, 0x8e, 0x55}; unsigned ins1_sub_ins2_exp_h[] = {0x7171, 0x8e8f, 0x8d8e, 0x5555}; unsigned ins1_sub_ins2_exp_s[] = {0x70017171, 0x8ffe8e8f, 0x8eed8d8e, 0x55555555}; uint64_t ins1_sub_ins2_exp_d[] = {0x7000000170017171, 0x8ffffffe8ffe8e8f, 0x8eeeeeed8eed8d8e, 0x5555555555555555}; IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sub_ins2_exp_b); IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sub_ins2_exp_h); IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sub_ins2_exp_s); IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sub_ins2_exp_d); unsigned ins2_sub_ins1_exp_b[] = {0x8f, 0x71, 0x72, 0xab}; unsigned ins2_sub_ins1_exp_h[] = {0x8e8f, 0x7171, 0x7272, 0xaaab}; unsigned ins2_sub_ins1_exp_s[] = {0x8ffe8e8f, 0x70017171, 0x71127272, 0xaaaaaaab}; uint64_t ins2_sub_ins1_exp_d[] = {0x8ffffffe8ffe8e8f, 0x7000000170017171, 0x7111111271127272, 0xaaaaaaaaaaaaaaab}; IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sub_ins1_exp_b); IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sub_ins1_exp_h); IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sub_ins1_exp_s); IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sub_ins1_exp_d); fn = &MacroAssembler::Sqsub; unsigned ins1_sqsub_ins2_exp_b[] = {0x80, 0x7f, 0x7f, 0x80}; unsigned ins1_sqsub_ins2_exp_h[] = {0x8000, 0x7fff, 0x7fff, 0x8000}; unsigned ins1_sqsub_ins2_exp_s[] = {0x80000000, 0x7fffffff, 0x7fffffff, 0x80000000}; uint64_t ins1_sqsub_ins2_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x8000000000000000}; IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sqsub_ins2_exp_b); IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sqsub_ins2_exp_h); IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sqsub_ins2_exp_s); IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sqsub_ins2_exp_d); unsigned ins2_sqsub_ins1_exp_b[] = {0x7f, 0x80, 0x80, 0x7f}; unsigned ins2_sqsub_ins1_exp_h[] = {0x7fff, 0x8000, 0x8000, 0x7fff}; unsigned ins2_sqsub_ins1_exp_s[] = {0x7fffffff, 0x80000000, 0x80000000, 0x7fffffff}; uint64_t ins2_sqsub_ins1_exp_d[] = {0x7fffffffffffffff, 0x8000000000000000, 0x8000000000000000, 0x7fffffffffffffff}; IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sqsub_ins1_exp_b); IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sqsub_ins1_exp_h); IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sqsub_ins1_exp_s); IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sqsub_ins1_exp_d); fn = &MacroAssembler::Uqsub; unsigned ins1_uqsub_ins2_exp_b[] = {0x71, 0x00, 0x00, 0x55}; unsigned ins1_uqsub_ins2_exp_h[] = {0x7171, 0x0000, 0x0000, 0x5555}; unsigned ins1_uqsub_ins2_exp_s[] = 
{0x70017171, 0x00000000, 0x00000000, 0x55555555}; uint64_t ins1_uqsub_ins2_exp_d[] = {0x7000000170017171, 0x0000000000000000, 0x0000000000000000, 0x5555555555555555}; IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_uqsub_ins2_exp_b); IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_uqsub_ins2_exp_h); IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_uqsub_ins2_exp_s); IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_uqsub_ins2_exp_d); unsigned ins2_uqsub_ins1_exp_b[] = {0x00, 0x71, 0x72, 0x00}; unsigned ins2_uqsub_ins1_exp_h[] = {0x0000, 0x7171, 0x7272, 0x0000}; unsigned ins2_uqsub_ins1_exp_s[] = {0x00000000, 0x70017171, 0x71127272, 0x00000000}; uint64_t ins2_uqsub_ins1_exp_d[] = {0x0000000000000000, 0x7000000170017171, 0x7111111271127272, 0x0000000000000000}; IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_uqsub_ins1_exp_b); IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_uqsub_ins1_exp_h); IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_uqsub_ins1_exp_s); IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_uqsub_ins1_exp_d); // clang-format on } TEST_SVE(sve_rdvl) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Encodable multipliers. __ Rdvl(x0, 0); __ Rdvl(x1, 1); __ Rdvl(x2, 2); __ Rdvl(x3, 31); __ Rdvl(x4, -1); __ Rdvl(x5, -2); __ Rdvl(x6, -32); // For unencodable multipliers, the MacroAssembler uses a sequence of // instructions. __ Rdvl(x10, 32); __ Rdvl(x11, -33); __ Rdvl(x12, 42); __ Rdvl(x13, -42); // The maximum value of VL is 256 (bytes), so the multiplier is limited to the // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow // occurs in the macro. __ Rdvl(x14, 0x007fffffffffffff); __ Rdvl(x15, -0x0080000000000000); END(); if (CAN_RUN()) { RUN(); uint64_t vl = config->sve_vl_in_bytes(); ASSERT_EQUAL_64(vl * 0, x0); ASSERT_EQUAL_64(vl * 1, x1); ASSERT_EQUAL_64(vl * 2, x2); ASSERT_EQUAL_64(vl * 31, x3); ASSERT_EQUAL_64(vl * -1, x4); ASSERT_EQUAL_64(vl * -2, x5); ASSERT_EQUAL_64(vl * -32, x6); ASSERT_EQUAL_64(vl * 32, x10); ASSERT_EQUAL_64(vl * -33, x11); ASSERT_EQUAL_64(vl * 42, x12); ASSERT_EQUAL_64(vl * -42, x13); ASSERT_EQUAL_64(vl * 0x007fffffffffffff, x14); ASSERT_EQUAL_64(vl * 0xff80000000000000, x15); } } TEST_SVE(sve_rdpl) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // There is no `rdpl` instruction, so the MacroAssembler maps `Rdpl` onto // Addpl(xd, xzr, ...). // Encodable multipliers (as `addvl`). __ Rdpl(x0, 0); __ Rdpl(x1, 8); __ Rdpl(x2, 248); __ Rdpl(x3, -8); __ Rdpl(x4, -256); // Encodable multipliers (as `movz` + `addpl`). __ Rdpl(x7, 31); __ Rdpl(x8, -31); // For unencodable multipliers, the MacroAssembler uses a sequence of // instructions. __ Rdpl(x10, 42); __ Rdpl(x11, -42); // The maximum value of VL is 256 (bytes), so the multiplier is limited to the // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow // occurs in the macro. 
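// For example, 0x007fffffffffffff * 256 = 0x7fffffffffffff00, which still
// fits in a signed 64-bit register, and -0x0080000000000000 * 256 is exactly
// INT64_MIN, so the two operands below sit right at that limit.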
__ Rdpl(x12, 0x007fffffffffffff); __ Rdpl(x13, -0x0080000000000000); END(); if (CAN_RUN()) { RUN(); uint64_t vl = config->sve_vl_in_bytes(); VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0); uint64_t pl = vl / kZRegBitsPerPRegBit; ASSERT_EQUAL_64(pl * 0, x0); ASSERT_EQUAL_64(pl * 8, x1); ASSERT_EQUAL_64(pl * 248, x2); ASSERT_EQUAL_64(pl * -8, x3); ASSERT_EQUAL_64(pl * -256, x4); ASSERT_EQUAL_64(pl * 31, x7); ASSERT_EQUAL_64(pl * -31, x8); ASSERT_EQUAL_64(pl * 42, x10); ASSERT_EQUAL_64(pl * -42, x11); ASSERT_EQUAL_64(pl * 0x007fffffffffffff, x12); ASSERT_EQUAL_64(pl * 0xff80000000000000, x13); } } TEST_SVE(sve_addvl) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t base = 0x1234567800000000; __ Mov(x30, base); // Encodable multipliers. __ Addvl(x0, x30, 0); __ Addvl(x1, x30, 1); __ Addvl(x2, x30, 31); __ Addvl(x3, x30, -1); __ Addvl(x4, x30, -32); // For unencodable multipliers, the MacroAssembler uses `Rdvl` and `Add`. __ Addvl(x5, x30, 32); __ Addvl(x6, x30, -33); // Test the limits of the multiplier supported by the `Rdvl` macro. __ Addvl(x7, x30, 0x007fffffffffffff); __ Addvl(x8, x30, -0x0080000000000000); // Check that xzr behaves correctly. __ Addvl(x9, xzr, 8); __ Addvl(x10, xzr, 42); // Check that sp behaves correctly with encodable and unencodable multipliers. __ Addvl(sp, sp, -5); __ Addvl(sp, sp, -37); __ Addvl(x11, sp, -2); __ Addvl(sp, x11, 2); __ Addvl(x12, sp, -42); // Restore the value of sp. __ Addvl(sp, x11, 39); __ Addvl(sp, sp, 5); // Adjust x11 and x12 to make the test sp-agnostic. __ Sub(x11, sp, x11); __ Sub(x12, sp, x12); // Check cases where xd.Is(xn). This stresses scratch register allocation. __ Mov(x20, x30); __ Mov(x21, x30); __ Mov(x22, x30); __ Addvl(x20, x20, 4); __ Addvl(x21, x21, 42); __ Addvl(x22, x22, -0x0080000000000000); END(); if (CAN_RUN()) { RUN(); uint64_t vl = config->sve_vl_in_bytes(); ASSERT_EQUAL_64(base + (vl * 0), x0); ASSERT_EQUAL_64(base + (vl * 1), x1); ASSERT_EQUAL_64(base + (vl * 31), x2); ASSERT_EQUAL_64(base + (vl * -1), x3); ASSERT_EQUAL_64(base + (vl * -32), x4); ASSERT_EQUAL_64(base + (vl * 32), x5); ASSERT_EQUAL_64(base + (vl * -33), x6); ASSERT_EQUAL_64(base + (vl * 0x007fffffffffffff), x7); ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x8); ASSERT_EQUAL_64(vl * 8, x9); ASSERT_EQUAL_64(vl * 42, x10); ASSERT_EQUAL_64(vl * 44, x11); ASSERT_EQUAL_64(vl * 84, x12); ASSERT_EQUAL_64(base + (vl * 4), x20); ASSERT_EQUAL_64(base + (vl * 42), x21); ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x22); ASSERT_EQUAL_64(base, x30); } } TEST_SVE(sve_addpl) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t base = 0x1234567800000000; __ Mov(x30, base); // Encodable multipliers. __ Addpl(x0, x30, 0); __ Addpl(x1, x30, 1); __ Addpl(x2, x30, 31); __ Addpl(x3, x30, -1); __ Addpl(x4, x30, -32); // For unencodable multipliers, the MacroAssembler uses `Addvl` if it can, or // it falls back to `Rdvl` and `Add`. __ Addpl(x5, x30, 32); __ Addpl(x6, x30, -33); // Test the limits of the multiplier supported by the `Rdvl` macro. __ Addpl(x7, x30, 0x007fffffffffffff); __ Addpl(x8, x30, -0x0080000000000000); // Check that xzr behaves correctly. __ Addpl(x9, xzr, 8); __ Addpl(x10, xzr, 42); // Check that sp behaves correctly with encodable and unencodable multipliers. __ Addpl(sp, sp, -5); __ Addpl(sp, sp, -37); __ Addpl(x11, sp, -2); __ Addpl(sp, x11, 2); __ Addpl(x12, sp, -42); // Restore the value of sp. __ Addpl(sp, x11, 39); __ Addpl(sp, sp, 5); // Adjust x11 and x12 to make the test sp-agnostic. 
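// (At this point x11 = original sp - 44 * PL and x12 = original sp - 84 * PL,
// so the subtractions below leave 44 * PL and 84 * PL in x11 and x12
// regardless of the initial value of sp.)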
__ Sub(x11, sp, x11); __ Sub(x12, sp, x12); // Check cases where xd.Is(xn). This stresses scratch register allocation. __ Mov(x20, x30); __ Mov(x21, x30); __ Mov(x22, x30); __ Addpl(x20, x20, 4); __ Addpl(x21, x21, 42); __ Addpl(x22, x22, -0x0080000000000000); END(); if (CAN_RUN()) { RUN(); uint64_t vl = config->sve_vl_in_bytes(); VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0); uint64_t pl = vl / kZRegBitsPerPRegBit; ASSERT_EQUAL_64(base + (pl * 0), x0); ASSERT_EQUAL_64(base + (pl * 1), x1); ASSERT_EQUAL_64(base + (pl * 31), x2); ASSERT_EQUAL_64(base + (pl * -1), x3); ASSERT_EQUAL_64(base + (pl * -32), x4); ASSERT_EQUAL_64(base + (pl * 32), x5); ASSERT_EQUAL_64(base + (pl * -33), x6); ASSERT_EQUAL_64(base + (pl * 0x007fffffffffffff), x7); ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x8); ASSERT_EQUAL_64(pl * 8, x9); ASSERT_EQUAL_64(pl * 42, x10); ASSERT_EQUAL_64(pl * 44, x11); ASSERT_EQUAL_64(pl * 84, x12); ASSERT_EQUAL_64(base + (pl * 4), x20); ASSERT_EQUAL_64(base + (pl * 42), x21); ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x22); ASSERT_EQUAL_64(base, x30); } } TEST_SVE(sve_calculate_sve_address) { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" // Shadow the `MacroAssembler` type so that the test macros work without // modification. typedef CalculateSVEAddressMacroAssembler MacroAssembler; SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // NOLINT(clang-diagnostic-local-type-template-args) uint64_t base = 0x1234567800000000; __ Mov(x28, base); __ Mov(x29, 48); __ Mov(x30, -48); // Simple scalar (or equivalent) cases. __ CalculateSVEAddress(x0, SVEMemOperand(x28)); __ CalculateSVEAddress(x1, SVEMemOperand(x28, 0)); __ CalculateSVEAddress(x2, SVEMemOperand(x28, 0, SVE_MUL_VL)); __ CalculateSVEAddress(x3, SVEMemOperand(x28, 0, SVE_MUL_VL), 3); __ CalculateSVEAddress(x4, SVEMemOperand(x28, xzr)); __ CalculateSVEAddress(x5, SVEMemOperand(x28, xzr, LSL, 42)); // scalar-plus-immediate // Unscaled immediates, handled with `Add`. __ CalculateSVEAddress(x6, SVEMemOperand(x28, 42)); __ CalculateSVEAddress(x7, SVEMemOperand(x28, -42)); // Scaled immediates, handled with `Addvl` or `Addpl`. __ CalculateSVEAddress(x8, SVEMemOperand(x28, 31, SVE_MUL_VL), 0); __ CalculateSVEAddress(x9, SVEMemOperand(x28, -32, SVE_MUL_VL), 0); // Out of `addvl` or `addpl` range. __ CalculateSVEAddress(x10, SVEMemOperand(x28, 42, SVE_MUL_VL), 0); __ CalculateSVEAddress(x11, SVEMemOperand(x28, -42, SVE_MUL_VL), 0); // As above, for VL-based accesses smaller than a Z register. VIXL_STATIC_ASSERT(kZRegBitsPerPRegBitLog2 == 3); __ CalculateSVEAddress(x12, SVEMemOperand(x28, -32 * 8, SVE_MUL_VL), 3); __ CalculateSVEAddress(x13, SVEMemOperand(x28, -42 * 8, SVE_MUL_VL), 3); __ CalculateSVEAddress(x14, SVEMemOperand(x28, -32 * 4, SVE_MUL_VL), 2); __ CalculateSVEAddress(x15, SVEMemOperand(x28, -42 * 4, SVE_MUL_VL), 2); __ CalculateSVEAddress(x18, SVEMemOperand(x28, -32 * 2, SVE_MUL_VL), 1); __ CalculateSVEAddress(x19, SVEMemOperand(x28, -42 * 2, SVE_MUL_VL), 1); // scalar-plus-scalar __ CalculateSVEAddress(x20, SVEMemOperand(x28, x29)); __ CalculateSVEAddress(x21, SVEMemOperand(x28, x30)); __ CalculateSVEAddress(x22, SVEMemOperand(x28, x29, LSL, 8)); __ CalculateSVEAddress(x23, SVEMemOperand(x28, x30, LSL, 8)); // In-place updates, to stress scratch register allocation. 
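// When the destination register is also the base or index of the address, the
// macro has to consume the old value before overwriting it, typically via a
// temporary from the scratch register pool; the checks below only pass if
// that ordering is respected.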
__ Mov(x24, 0xabcd000000000000);
__ Mov(x25, 0xabcd101100000000);
__ Mov(x26, 0xabcd202200000000);
__ Mov(x27, 0xabcd303300000000);
__ Mov(x28, 0xabcd404400000000);
__ Mov(x29, 0xabcd505500000000);
__ CalculateSVEAddress(x24, SVEMemOperand(x24));
__ CalculateSVEAddress(x25, SVEMemOperand(x25, 0x42));
__ CalculateSVEAddress(x26, SVEMemOperand(x26, 3, SVE_MUL_VL), 0);
__ CalculateSVEAddress(x27, SVEMemOperand(x27, 0x42, SVE_MUL_VL), 3);
__ CalculateSVEAddress(x28, SVEMemOperand(x28, x30));
__ CalculateSVEAddress(x29, SVEMemOperand(x29, x30, LSL, 4));
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
uint64_t pl = vl / kZRegBitsPerPRegBit;
// Simple scalar (or equivalent) cases.
ASSERT_EQUAL_64(base, x0);
ASSERT_EQUAL_64(base, x1);
ASSERT_EQUAL_64(base, x2);
ASSERT_EQUAL_64(base, x3);
ASSERT_EQUAL_64(base, x4);
ASSERT_EQUAL_64(base, x5);
// scalar-plus-immediate
ASSERT_EQUAL_64(base + 42, x6);
ASSERT_EQUAL_64(base - 42, x7);
ASSERT_EQUAL_64(base + (31 * vl), x8);
ASSERT_EQUAL_64(base - (32 * vl), x9);
ASSERT_EQUAL_64(base + (42 * vl), x10);
ASSERT_EQUAL_64(base - (42 * vl), x11);
ASSERT_EQUAL_64(base - (32 * vl), x12);
ASSERT_EQUAL_64(base - (42 * vl), x13);
ASSERT_EQUAL_64(base - (32 * vl), x14);
ASSERT_EQUAL_64(base - (42 * vl), x15);
ASSERT_EQUAL_64(base - (32 * vl), x18);
ASSERT_EQUAL_64(base - (42 * vl), x19);
// scalar-plus-scalar
ASSERT_EQUAL_64(base + 48, x20);
ASSERT_EQUAL_64(base - 48, x21);
ASSERT_EQUAL_64(base + (48 << 8), x22);
ASSERT_EQUAL_64(base - (48 << 8), x23);
// In-place updates.
ASSERT_EQUAL_64(0xabcd000000000000, x24);
ASSERT_EQUAL_64(0xabcd101100000000 + 0x42, x25);
ASSERT_EQUAL_64(0xabcd202200000000 + (3 * vl), x26);
ASSERT_EQUAL_64(0xabcd303300000000 + (0x42 * pl), x27);
ASSERT_EQUAL_64(0xabcd404400000000 - 48, x28);
ASSERT_EQUAL_64(0xabcd505500000000 - (48 << 4), x29);
}
#pragma GCC diagnostic pop
} TEST_SVE(sve_permute_vector_unpredicated) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON); START();
// Initialise registers with known values first.
__ Dup(z1.VnB(), 0x11);
__ Dup(z2.VnB(), 0x22);
__ Dup(z3.VnB(), 0x33);
__ Dup(z4.VnB(), 0x44);
__ Mov(x0, 0x0123456789abcdef);
__ Fmov(d0, RawbitsToDouble(0x7ffaaaaa22223456));
__ Insr(z1.VnS(), w0);
__ Insr(z2.VnD(), x0);
__ Insr(z3.VnH(), h0);
__ Insr(z4.VnD(), d0);
uint64_t inputs[] = {0xfedcba9876543210, 0x0123456789abcdef, 0x8f8e8d8c8b8a8988, 0x8786858483828180};
// Initialise the whole register with a distinguishable value first.
__ Dup(z9.VnB(), 0xff);
InsrHelper(&masm, z9.VnD(), inputs);
__ Rev(z5.VnB(), z9.VnB());
__ Rev(z6.VnH(), z9.VnH());
__ Rev(z7.VnS(), z9.VnS());
__ Rev(z8.VnD(), z9.VnD());
int index[7] = {22, 7, 7, 3, 1, 1, 63};
// Broadcast elements from within the input data.
__ Dup(z10.VnB(), z9.VnB(), index[0]);
__ Dup(z11.VnH(), z9.VnH(), index[1]);
__ Dup(z12.VnS(), z9.VnS(), index[2]);
__ Dup(z13.VnD(), z9.VnD(), index[3]);
__ Dup(z14.VnQ(), z9.VnQ(), index[4]);
// Test dst == src
__ Mov(z15, z9);
__ Dup(z15.VnS(), z15.VnS(), index[5]);
// Select an element beyond the input data.
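// Note: index[6] is 63, so the `Dup` below reads z9.VnB()[63], a lane that
// only exists if the vector is at least 64 bytes (512 bits) long. On shorter
// vectors the index is out of range and the result is zero; the
// `expected_z16` calculation in the checks below allows for both outcomes.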
__ Dup(z16.VnB(), z9.VnB(), index[6]); END(); if (CAN_RUN()) { RUN(); // Insr uint64_t z1_expected[] = {0x1111111111111111, 0x1111111189abcdef}; uint64_t z2_expected[] = {0x2222222222222222, 0x0123456789abcdef}; uint64_t z3_expected[] = {0x3333333333333333, 0x3333333333333456}; uint64_t z4_expected[] = {0x4444444444444444, 0x7ffaaaaa22223456}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); ASSERT_EQUAL_SVE(z2_expected, z2.VnD()); ASSERT_EQUAL_SVE(z3_expected, z3.VnD()); ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); // Rev int lane_count = core.GetSVELaneCount(kBRegSize); for (int i = 0; i < lane_count; i++) { uint64_t expected = core.zreg_lane(z5.GetCode(), kBRegSize, lane_count - i - 1); uint64_t input = core.zreg_lane(z9.GetCode(), kBRegSize, i); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kHRegSize); for (int i = 0; i < lane_count; i++) { uint64_t expected = core.zreg_lane(z6.GetCode(), kHRegSize, lane_count - i - 1); uint64_t input = core.zreg_lane(z9.GetCode(), kHRegSize, i); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kSRegSize); for (int i = 0; i < lane_count; i++) { uint64_t expected = core.zreg_lane(z7.GetCode(), kSRegSize, lane_count - i - 1); uint64_t input = core.zreg_lane(z9.GetCode(), kSRegSize, i); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kDRegSize); for (int i = 0; i < lane_count; i++) { uint64_t expected = core.zreg_lane(z8.GetCode(), kDRegSize, lane_count - i - 1); uint64_t input = core.zreg_lane(z9.GetCode(), kDRegSize, i); ASSERT_EQUAL_64(expected, input); } // Dup unsigned vl = config->sve_vl_in_bits(); lane_count = core.GetSVELaneCount(kBRegSize); uint64_t expected_z10 = (vl > (index[0] * kBRegSize)) ? 0x23 : 0; for (int i = 0; i < lane_count; i++) { ASSERT_EQUAL_SVE_LANE(expected_z10, z10.VnB(), i); } lane_count = core.GetSVELaneCount(kHRegSize); uint64_t expected_z11 = (vl > (index[1] * kHRegSize)) ? 0x8f8e : 0; for (int i = 0; i < lane_count; i++) { ASSERT_EQUAL_SVE_LANE(expected_z11, z11.VnH(), i); } lane_count = core.GetSVELaneCount(kSRegSize); uint64_t expected_z12 = (vl > (index[2] * kSRegSize)) ? 0xfedcba98 : 0; for (int i = 0; i < lane_count; i++) { ASSERT_EQUAL_SVE_LANE(expected_z12, z12.VnS(), i); } lane_count = core.GetSVELaneCount(kDRegSize); uint64_t expected_z13 = (vl > (index[3] * kDRegSize)) ? 0xfedcba9876543210 : 0; for (int i = 0; i < lane_count; i++) { ASSERT_EQUAL_SVE_LANE(expected_z13, z13.VnD(), i); } lane_count = core.GetSVELaneCount(kDRegSize); uint64_t expected_z14_lo = 0; uint64_t expected_z14_hi = 0; if (vl > (index[4] * kQRegSize)) { expected_z14_lo = 0x0123456789abcdef; expected_z14_hi = 0xfedcba9876543210; } for (int i = 0; i < lane_count; i += 2) { ASSERT_EQUAL_SVE_LANE(expected_z14_lo, z14.VnD(), i); ASSERT_EQUAL_SVE_LANE(expected_z14_hi, z14.VnD(), i + 1); } lane_count = core.GetSVELaneCount(kSRegSize); uint64_t expected_z15 = (vl > (index[5] * kSRegSize)) ? 0x87868584 : 0; for (int i = 0; i < lane_count; i++) { ASSERT_EQUAL_SVE_LANE(expected_z15, z15.VnS(), i); } lane_count = core.GetSVELaneCount(kBRegSize); uint64_t expected_z16 = (vl > (index[6] * kBRegSize)) ? 
0xff : 0; for (int i = 0; i < lane_count; i++) { ASSERT_EQUAL_SVE_LANE(expected_z16, z16.VnB(), i); } } } TEST_SVE(sve_permute_vector_unpredicated_unpack_vector_elements) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t z9_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef, 0x8f8e8d8c8b8a8988, 0x8786858483828180}; InsrHelper(&masm, z9.VnD(), z9_inputs); __ Sunpkhi(z10.VnH(), z9.VnB()); __ Sunpkhi(z11.VnS(), z9.VnH()); __ Sunpkhi(z12.VnD(), z9.VnS()); __ Sunpklo(z13.VnH(), z9.VnB()); __ Sunpklo(z14.VnS(), z9.VnH()); __ Sunpklo(z15.VnD(), z9.VnS()); __ Uunpkhi(z16.VnH(), z9.VnB()); __ Uunpkhi(z17.VnS(), z9.VnH()); __ Uunpkhi(z18.VnD(), z9.VnS()); __ Uunpklo(z19.VnH(), z9.VnB()); __ Uunpklo(z20.VnS(), z9.VnH()); __ Uunpklo(z21.VnD(), z9.VnS()); // Test unpacking with same source and destination. __ Mov(z22, z9); __ Sunpklo(z22.VnH(), z22.VnB()); __ Mov(z23, z9); __ Uunpklo(z23.VnH(), z23.VnB()); END(); if (CAN_RUN()) { RUN(); // Suunpkhi int lane_count = core.GetSVELaneCount(kHRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint16_t expected = core.zreg_lane(z10.GetCode(), i); uint8_t b_lane = core.zreg_lane(z9.GetCode(), i + lane_count); uint16_t input = SignExtend(b_lane, kBRegSize); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kSRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint32_t expected = core.zreg_lane(z11.GetCode(), i); uint16_t h_lane = core.zreg_lane(z9.GetCode(), i + lane_count); uint32_t input = SignExtend(h_lane, kHRegSize); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kDRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint64_t expected = core.zreg_lane(z12.GetCode(), i); uint32_t s_lane = core.zreg_lane(z9.GetCode(), i + lane_count); uint64_t input = SignExtend(s_lane, kSRegSize); ASSERT_EQUAL_64(expected, input); } // Suunpklo lane_count = core.GetSVELaneCount(kHRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint16_t expected = core.zreg_lane(z13.GetCode(), i); uint8_t b_lane = core.zreg_lane(z9.GetCode(), i); uint16_t input = SignExtend(b_lane, kBRegSize); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kSRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint32_t expected = core.zreg_lane(z14.GetCode(), i); uint16_t h_lane = core.zreg_lane(z9.GetCode(), i); uint32_t input = SignExtend(h_lane, kHRegSize); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kDRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint64_t expected = core.zreg_lane(z15.GetCode(), i); uint32_t s_lane = core.zreg_lane(z9.GetCode(), i); uint64_t input = SignExtend(s_lane, kSRegSize); ASSERT_EQUAL_64(expected, input); } // Uuunpkhi lane_count = core.GetSVELaneCount(kHRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint16_t expected = core.zreg_lane(z16.GetCode(), i); uint16_t input = core.zreg_lane(z9.GetCode(), i + lane_count); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kSRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint32_t expected = core.zreg_lane(z17.GetCode(), i); uint32_t input = core.zreg_lane(z9.GetCode(), i + lane_count); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kDRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint64_t expected = core.zreg_lane(z18.GetCode(), i); uint64_t input = core.zreg_lane(z9.GetCode(), i + lane_count); ASSERT_EQUAL_64(expected, input); } // Uuunpklo lane_count = core.GetSVELaneCount(kHRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint16_t expected = 
core.zreg_lane(z19.GetCode(), i); uint16_t input = core.zreg_lane(z9.GetCode(), i); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kSRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint32_t expected = core.zreg_lane(z20.GetCode(), i); uint32_t input = core.zreg_lane(z9.GetCode(), i); ASSERT_EQUAL_64(expected, input); } lane_count = core.GetSVELaneCount(kDRegSize); for (int i = lane_count - 1; i >= 0; i--) { uint64_t expected = core.zreg_lane(z21.GetCode(), i); uint64_t input = core.zreg_lane(z9.GetCode(), i); ASSERT_EQUAL_64(expected, input); } ASSERT_EQUAL_SVE(z13, z22); ASSERT_EQUAL_SVE(z19, z23); } } TEST_SVE(sve_cnot_not) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in[] = {0x0000000000000000, 0x00000000e1c30000, 0x123456789abcdef0}; // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 1, 1, 0 // For S lanes: 1, 1, 1, 0, 0 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0}; Initialise(&masm, p0.VnB(), pg_in); PRegisterM pg = p0.Merging(); // These are merging operations, so we have to initialise the result register. // We use a mixture of constructive and destructive operations. InsrHelper(&masm, z31.VnD(), in); // Make a copy so we can check that constructive operations preserve zn. __ Mov(z30, z31); // For constructive operations, use a different initial result value. __ Index(z29.VnB(), 0, -1); __ Mov(z0, z31); __ Cnot(z0.VnB(), pg, z0.VnB()); // destructive __ Mov(z1, z29); __ Cnot(z1.VnH(), pg, z31.VnH()); __ Mov(z2, z31); __ Cnot(z2.VnS(), pg, z2.VnS()); // destructive __ Mov(z3, z29); __ Cnot(z3.VnD(), pg, z31.VnD()); __ Mov(z4, z29); __ Not(z4.VnB(), pg, z31.VnB()); __ Mov(z5, z31); __ Not(z5.VnH(), pg, z5.VnH()); // destructive __ Mov(z6, z29); __ Not(z6.VnS(), pg, z31.VnS()); __ Mov(z7, z31); __ Not(z7.VnD(), pg, z7.VnD()); // destructive END(); if (CAN_RUN()) { RUN(); // Check that constructive operations preserve their inputs. 
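// z30 was copied from z31 before any of the operations above, so comparing
// the two confirms that the constructive forms (which read z31 directly)
// left their input untouched.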
ASSERT_EQUAL_SVE(z30, z31); // clang-format off // Cnot (B) destructive uint64_t expected_z0[] = // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 {0x0000000001000101, 0x01000001e1000101, 0x12340078000000f0}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); // Cnot (H) uint64_t expected_z1[] = // pg: 0 0 0 1 0 1 1 1 0 0 1 0 {0xe9eaebecedee0001, 0xf1f2000100000001, 0xf9fafbfc0000ff00}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); // Cnot (S) destructive uint64_t expected_z2[] = // pg: 0 1 1 1 0 0 {0x0000000000000001, 0x0000000100000000, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); // Cnot (D) uint64_t expected_z3[] = // pg: 1 1 0 {0x0000000000000001, 0x0000000000000000, 0xf9fafbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); // Not (B) uint64_t expected_z4[] = // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 {0xe9eaebecffeeffff, 0xfff2f3fff53cffff, 0xf9faa9fc65432100}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); // Not (H) destructive uint64_t expected_z5[] = // pg: 0 0 0 1 0 1 1 1 0 0 1 0 {0x000000000000ffff, 0x0000ffff1e3cffff, 0x123456786543def0}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); // Not (S) uint64_t expected_z6[] = // pg: 0 1 1 1 0 0 {0xe9eaebecffffffff, 0xffffffff1e3cffff, 0xf9fafbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); // Not (D) destructive uint64_t expected_z7[] = // pg: 1 1 0 {0xffffffffffffffff, 0xffffffff1e3cffff, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); // clang-format on } } TEST_SVE(sve_fabs_fneg) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Include FP64, FP32 and FP16 signalling NaNs. Most FP operations quieten // NaNs, but fabs and fneg do not. uint64_t in[] = {0xc04500004228d140, // Recognisable (+/-42) values. 0xfff00000ff80fc01, // Signalling NaNs. 0x123456789abcdef0}; // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 1, 1, 0 // For S lanes: 1, 1, 1, 0, 0 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0}; Initialise(&masm, p0.VnB(), pg_in); PRegisterM pg = p0.Merging(); // These are merging operations, so we have to initialise the result register. // We use a mixture of constructive and destructive operations. InsrHelper(&masm, z31.VnD(), in); // Make a copy so we can check that constructive operations preserve zn. __ Mov(z30, z31); // For constructive operations, use a different initial result value. __ Index(z29.VnB(), 0, -1); __ Mov(z0, z29); __ Fabs(z0.VnH(), pg, z31.VnH()); __ Mov(z1, z31); __ Fabs(z1.VnS(), pg, z1.VnS()); // destructive __ Mov(z2, z29); __ Fabs(z2.VnD(), pg, z31.VnD()); __ Mov(z3, z31); __ Fneg(z3.VnH(), pg, z3.VnH()); // destructive __ Mov(z4, z29); __ Fneg(z4.VnS(), pg, z31.VnS()); __ Mov(z5, z31); __ Fneg(z5.VnD(), pg, z5.VnD()); // destructive END(); if (CAN_RUN()) { RUN(); // Check that constructive operations preserve their inputs. 
ASSERT_EQUAL_SVE(z30, z31); // clang-format off // Fabs (H) uint64_t expected_z0[] = // pg: 0 0 0 1 0 1 1 1 0 0 1 0 {0xe9eaebecedee5140, 0xf1f200007f807c01, 0xf9fafbfc1abcff00}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); // Fabs (S) destructive uint64_t expected_z1[] = // pg: 0 1 1 1 0 0 {0xc04500004228d140, 0x7ff000007f80fc01, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); // Fabs (D) uint64_t expected_z2[] = // pg: 1 1 0 {0x404500004228d140, 0x7ff00000ff80fc01, 0xf9fafbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); // Fneg (H) destructive uint64_t expected_z3[] = // pg: 0 0 0 1 0 1 1 1 0 0 1 0 {0xc045000042285140, 0xfff080007f807c01, 0x123456781abcdef0}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); // Fneg (S) uint64_t expected_z4[] = // pg: 0 1 1 1 0 0 {0xe9eaebecc228d140, 0x7ff000007f80fc01, 0xf9fafbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); // Fneg (D) destructive uint64_t expected_z5[] = // pg: 1 1 0 {0x404500004228d140, 0x7ff00000ff80fc01, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); // clang-format on } } TEST_SVE(sve_cls_clz_cnt) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in[] = {0x0000000000000000, 0xfefcf8f0e1c3870f, 0x123456789abcdef0}; // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 1, 1, 0 // For S lanes: 1, 1, 1, 0, 0 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0}; Initialise(&masm, p0.VnB(), pg_in); PRegisterM pg = p0.Merging(); // These are merging operations, so we have to initialise the result register. // We use a mixture of constructive and destructive operations. InsrHelper(&masm, z31.VnD(), in); // Make a copy so we can check that constructive operations preserve zn. __ Mov(z30, z31); // For constructive operations, use a different initial result value. __ Index(z29.VnB(), 0, -1); __ Mov(z0, z29); __ Cls(z0.VnB(), pg, z31.VnB()); __ Mov(z1, z31); __ Clz(z1.VnH(), pg, z1.VnH()); // destructive __ Mov(z2, z29); __ Cnt(z2.VnS(), pg, z31.VnS()); __ Mov(z3, z31); __ Cnt(z3.VnD(), pg, z3.VnD()); // destructive END(); if (CAN_RUN()) { RUN(); // Check that non-destructive operations preserve their inputs. ASSERT_EQUAL_SVE(z30, z31); // clang-format off // cls (B) uint8_t expected_z0[] = // pg: 0 0 0 0 1 0 1 1 // pg: 1 0 0 1 0 1 1 1 // pg: 0 0 1 0 1 1 1 0 {0xe9, 0xea, 0xeb, 0xec, 7, 0xee, 7, 7, 6, 0xf2, 0xf3, 3, 0xf5, 1, 0, 3, 0xf9, 0xfa, 0, 0xfc, 0, 0, 1, 0x00}; ASSERT_EQUAL_SVE(expected_z0, z0.VnB()); // clz (H) destructive uint16_t expected_z1[] = // pg: 0 0 0 1 // pg: 0 1 1 1 // pg: 0 0 1 0 {0x0000, 0x0000, 0x0000, 16, 0xfefc, 0, 0, 0, 0x1234, 0x5678, 0, 0xdef0}; ASSERT_EQUAL_SVE(expected_z1, z1.VnH()); // cnt (S) uint32_t expected_z2[] = // pg: 0 1 // pg: 1 1 // pg: 0 0 {0xe9eaebec, 0, 22, 16, 0xf9fafbfc, 0xfdfeff00}; ASSERT_EQUAL_SVE(expected_z2, z2.VnS()); // cnt (D) destructive uint64_t expected_z3[] = // pg: 1 1 0 { 0, 38, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); // clang-format on } } TEST_SVE(sve_sxt) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0}; // For simplicity, we re-use the same pg for various lane sizes. 
// For D lanes: 1, 1, 0 // For S lanes: 1, 1, 1, 0, 0 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0}; Initialise(&masm, p0.VnB(), pg_in); PRegisterM pg = p0.Merging(); // These are merging operations, so we have to initialise the result register. // We use a mixture of constructive and destructive operations. InsrHelper(&masm, z31.VnD(), in); // Make a copy so we can check that constructive operations preserve zn. __ Mov(z30, z31); // For constructive operations, use a different initial result value. __ Index(z29.VnB(), 0, -1); __ Mov(z0, z31); __ Sxtb(z0.VnH(), pg, z0.VnH()); // destructive __ Mov(z1, z29); __ Sxtb(z1.VnS(), pg, z31.VnS()); __ Mov(z2, z31); __ Sxtb(z2.VnD(), pg, z2.VnD()); // destructive __ Mov(z3, z29); __ Sxth(z3.VnS(), pg, z31.VnS()); __ Mov(z4, z31); __ Sxth(z4.VnD(), pg, z4.VnD()); // destructive __ Mov(z5, z29); __ Sxtw(z5.VnD(), pg, z31.VnD()); END(); if (CAN_RUN()) { RUN(); // Check that constructive operations preserve their inputs. ASSERT_EQUAL_SVE(z30, z31); // clang-format off // Sxtb (H) destructive uint64_t expected_z0[] = // pg: 0 0 0 1 0 1 1 1 0 0 1 0 {0x01f203f405f6fff8, 0xfefcfff0ffc3000f, 0x12345678ffbcdef0}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); // Sxtb (S) uint64_t expected_z1[] = // pg: 0 1 1 1 0 0 {0xe9eaebecfffffff8, 0xfffffff00000000f, 0xf9fafbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); // Sxtb (D) destructive uint64_t expected_z2[] = // pg: 1 1 0 {0xfffffffffffffff8, 0x000000000000000f, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); // Sxth (S) uint64_t expected_z3[] = // pg: 0 1 1 1 0 0 {0xe9eaebec000007f8, 0xfffff8f0ffff870f, 0xf9fafbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); // Sxth (D) destructive uint64_t expected_z4[] = // pg: 1 1 0 {0x00000000000007f8, 0xffffffffffff870f, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); // Sxtw (D) uint64_t expected_z5[] = // pg: 1 1 0 {0x0000000005f607f8, 0xffffffffe1c3870f, 0xf9fafbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); // clang-format on } } TEST_SVE(sve_uxt) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0}; // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 1, 1, 0 // For S lanes: 1, 1, 1, 0, 0 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0}; Initialise(&masm, p0.VnB(), pg_in); PRegisterM pg = p0.Merging(); // These are merging operations, so we have to initialise the result register. // We use a mixture of constructive and destructive operations. InsrHelper(&masm, z31.VnD(), in); // Make a copy so we can check that constructive operations preserve zn. __ Mov(z30, z31); // For constructive operations, use a different initial result value. 
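// The `Index(z29.VnB(), 0, -1)` below fills the register with the byte
// pattern 0x00, 0xff, 0xfe, 0xfd, ... (lane n holds 0 - n). Inactive lanes
// keep this easily recognisable background, which is why values such as
// 0xf9fafbfcfdfeff00 appear in the expected results further down.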
__ Index(z29.VnB(), 0, -1); __ Mov(z0, z29); __ Uxtb(z0.VnH(), pg, z31.VnH()); __ Mov(z1, z31); __ Uxtb(z1.VnS(), pg, z1.VnS()); // destructive __ Mov(z2, z29); __ Uxtb(z2.VnD(), pg, z31.VnD()); __ Mov(z3, z31); __ Uxth(z3.VnS(), pg, z3.VnS()); // destructive __ Mov(z4, z29); __ Uxth(z4.VnD(), pg, z31.VnD()); __ Mov(z5, z31); __ Uxtw(z5.VnD(), pg, z5.VnD()); // destructive END(); if (CAN_RUN()) { RUN(); // clang-format off // Uxtb (H) uint64_t expected_z0[] = // pg: 0 0 0 1 0 1 1 1 0 0 1 0 {0xe9eaebecedee00f8, 0xf1f200f000c3000f, 0xf9fafbfc00bcff00}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); // Uxtb (S) destructive uint64_t expected_z1[] = // pg: 0 1 1 1 0 0 {0x01f203f4000000f8, 0x000000f00000000f, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); // Uxtb (D) uint64_t expected_z2[] = // pg: 1 1 0 {0x00000000000000f8, 0x000000000000000f, 0xf9fafbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); // Uxth (S) destructive uint64_t expected_z3[] = // pg: 0 1 1 1 0 0 {0x01f203f4000007f8, 0x0000f8f00000870f, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); // Uxth (D) uint64_t expected_z4[] = // pg: 1 1 0 {0x00000000000007f8, 0x000000000000870f, 0xf9fafbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); // Uxtw (D) destructive uint64_t expected_z5[] = // pg: 1 1 0 {0x0000000005f607f8, 0x00000000e1c3870f, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); // clang-format on } } TEST_SVE(sve_abs_neg) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0}; // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 1, 1, 0 // For S lanes: 1, 1, 1, 0, 0 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0}; Initialise(&masm, p0.VnB(), pg_in); PRegisterM pg = p0.Merging(); InsrHelper(&masm, z31.VnD(), in); // These are merging operations, so we have to initialise the result register. // We use a mixture of constructive and destructive operations. InsrHelper(&masm, z31.VnD(), in); // Make a copy so we can check that constructive operations preserve zn. __ Mov(z30, z31); // For constructive operations, use a different initial result value. __ Index(z29.VnB(), 0, -1); __ Mov(z0, z31); __ Abs(z0.VnD(), pg, z0.VnD()); // destructive __ Mov(z1, z29); __ Abs(z1.VnB(), pg, z31.VnB()); __ Mov(z2, z31); __ Neg(z2.VnH(), pg, z2.VnH()); // destructive __ Mov(z3, z29); __ Neg(z3.VnS(), pg, z31.VnS()); // The unpredicated form of `Neg` is implemented using `subr`. 
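// In other words, `Neg(zd.VnT(), zn.VnT())` without a governing predicate is
// expected to come out as a reversed subtract from zero (roughly
// `subr zd.T, zd.T, #0`, i.e. zd = 0 - zd), negating every lane.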
__ Mov(z4, z31); __ Neg(z4.VnB(), z4.VnB()); // destructive __ Mov(z5, z29); __ Neg(z5.VnD(), z31.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z30, z31); // clang-format off // Abs (D) destructive uint64_t expected_z0[] = // pg: 1 1 0 {0x01f203f405f607f8, 0x0103070f1e3c78f1, 0x123456789abcdef0}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); // Abs (B) uint64_t expected_z1[] = // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 {0xe9eaebec05ee0708, 0x02f2f310f53d790f, 0xf9fa56fc66442200}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); // Neg (H) destructive uint64_t expected_z2[] = // pg: 0 0 0 1 0 1 1 1 0 0 1 0 {0x01f203f405f6f808, 0xfefc07101e3d78f1, 0x123456786544def0}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); // Neg (S) uint64_t expected_z3[] = // pg: 0 1 1 1 0 0 {0xe9eaebecfa09f808, 0x010307101e3c78f1, 0xf9fafbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); // Neg (B) destructive, unpredicated uint64_t expected_z4[] = {0xff0efd0cfb0af908, 0x020408101f3d79f1, 0xeeccaa8866442210}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); // Neg (D) unpredicated uint64_t expected_z5[] = {0xfe0dfc0bfa09f808, 0x0103070f1e3c78f1, 0xedcba98765432110}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); // clang-format on } } TEST_SVE(sve_cpy) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON); START(); // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 0, 1, 1 // For S lanes: 0, 1, 1, 0, 1 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1}; PRegisterM pg = p7.Merging(); Initialise(&masm, pg.VnB(), pg_in); // These are merging operations, so we have to initialise the result registers // for each operation. for (unsigned i = 0; i < kNumberOfZRegisters; i++) { __ Index(ZRegister(i, kBRegSize), 0, -1); } // Recognisable values to copy. __ Mov(x0, 0xdeadbeefdeadbe42); __ Mov(x1, 0xdeadbeefdead8421); __ Mov(x2, 0xdeadbeef80042001); __ Mov(x3, 0x8000000420000001); // Use NEON moves, to avoid testing SVE `cpy` against itself. __ Dup(v28.V2D(), x0); __ Dup(v29.V2D(), x1); __ Dup(v30.V2D(), x2); __ Dup(v31.V2D(), x3); // Register forms (CPY_z_p_r) __ Cpy(z0.VnB(), pg, w0); __ Cpy(z1.VnH(), pg, x1); // X registers are accepted for small lanes. __ Cpy(z2.VnS(), pg, w2); __ Cpy(z3.VnD(), pg, x3); // VRegister forms (CPY_z_p_v) __ Cpy(z4.VnB(), pg, b28); __ Cpy(z5.VnH(), pg, h29); __ Cpy(z6.VnS(), pg, s30); __ Cpy(z7.VnD(), pg, d31); // Check that we can copy the stack pointer. __ Mov(x10, sp); __ Mov(sp, 0xabcabcabcabcabca); // Set sp to a known value. __ Cpy(z16.VnB(), pg, sp); __ Cpy(z17.VnH(), pg, wsp); __ Cpy(z18.VnS(), pg, wsp); __ Cpy(z19.VnD(), pg, sp); __ Mov(sp, x10); // Restore sp. 
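// As a worked example for the checks below: in the lowest 64 bits of the
// predicate, only byte lanes 0 and 6 are active, so those two lanes take the
// low byte of w0 (0x42) while the others keep their Index background. That
// gives 0xf942fbfcfdfeff42 for the lowest D-sized chunk of z0.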
END(); if (CAN_RUN()) { RUN(); // clang-format off uint64_t expected_b[] = // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 {0xe9eaebec424242f0, 0x42f2f34242f64242, 0xf942fbfcfdfeff42}; ASSERT_EQUAL_SVE(expected_b, z0.VnD()); ASSERT_EQUAL_SVE(expected_b, z4.VnD()); uint64_t expected_h[] = // pg: 0 0 1 0 0 1 0 1 1 0 0 1 {0xe9eaebec8421eff0, 0xf1f28421f5f68421, 0x8421fbfcfdfe8421}; ASSERT_EQUAL_SVE(expected_h, z1.VnD()); ASSERT_EQUAL_SVE(expected_h, z5.VnD()); uint64_t expected_s[] = // pg: 0 0 1 1 0 1 {0xe9eaebecedeeeff0, 0x8004200180042001, 0xf9fafbfc80042001}; ASSERT_EQUAL_SVE(expected_s, z2.VnD()); ASSERT_EQUAL_SVE(expected_s, z6.VnD()); uint64_t expected_d[] = // pg: 0 1 1 {0xe9eaebecedeeeff0, 0x8000000420000001, 0x8000000420000001}; ASSERT_EQUAL_SVE(expected_d, z3.VnD()); ASSERT_EQUAL_SVE(expected_d, z7.VnD()); uint64_t expected_b_sp[] = // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 {0xe9eaebeccacacaf0, 0xcaf2f3cacaf6caca, 0xf9cafbfcfdfeffca}; ASSERT_EQUAL_SVE(expected_b_sp, z16.VnD()); uint64_t expected_h_sp[] = // pg: 0 0 1 0 0 1 0 1 1 0 0 1 {0xe9eaebecabcaeff0, 0xf1f2abcaf5f6abca, 0xabcafbfcfdfeabca}; ASSERT_EQUAL_SVE(expected_h_sp, z17.VnD()); uint64_t expected_s_sp[] = // pg: 0 0 1 1 0 1 {0xe9eaebecedeeeff0, 0xcabcabcacabcabca, 0xf9fafbfccabcabca}; ASSERT_EQUAL_SVE(expected_s_sp, z18.VnD()); uint64_t expected_d_sp[] = // pg: 0 1 1 {0xe9eaebecedeeeff0, 0xabcabcabcabcabca, 0xabcabcabcabcabca}; ASSERT_EQUAL_SVE(expected_d_sp, z19.VnD()); // clang-format on } } TEST_SVE(sve_cpy_imm) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 0, 1, 1 // For S lanes: 0, 1, 1, 0, 1 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1}; PRegister pg = p7; Initialise(&masm, pg.VnB(), pg_in); // These are (mostly) merging operations, so we have to initialise the result // registers for each operation. for (unsigned i = 0; i < kNumberOfZRegisters; i++) { __ Index(ZRegister(i, kBRegSize), 0, -1); } // Encodable integer forms (CPY_z_p_i) __ Cpy(z0.VnB(), pg.Merging(), 0); __ Cpy(z1.VnB(), pg.Zeroing(), 42); __ Cpy(z2.VnB(), pg.Merging(), -42); __ Cpy(z3.VnB(), pg.Zeroing(), 0xff); __ Cpy(z4.VnH(), pg.Merging(), 127); __ Cpy(z5.VnS(), pg.Zeroing(), -128); __ Cpy(z6.VnD(), pg.Merging(), -1); // Forms encodable using fcpy. __ Cpy(z7.VnH(), pg.Merging(), Float16ToRawbits(Float16(-31.0))); __ Cpy(z8.VnS(), pg.Zeroing(), FloatToRawbits(2.0f)); __ Cpy(z9.VnD(), pg.Merging(), DoubleToRawbits(-4.0)); // Other forms use a scratch register. 
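// For example, 0xff in H-sized lanes means 0x00ff, which is not expressible
// as a sign-extended 8-bit immediate (that would give 0xffff) nor as an
// 8-bit immediate shifted left by 8 (0xXX00), and 0x0123456789abcdef is far
// too wide for any immediate form. In such cases the value is presumably
// materialised into a temporary with `Mov` and copied using the register
// form of `Cpy`.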
__ Cpy(z10.VnH(), pg.Merging(), 0xff); __ Cpy(z11.VnD(), pg.Zeroing(), 0x0123456789abcdef); END(); if (CAN_RUN()) { RUN(); // clang-format off uint64_t expected_z0[] = // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 {0xe9eaebec000000f0, 0x00f2f30000f60000, 0xf900fbfcfdfeff00}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); uint64_t expected_z1[] = // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 {0x000000002a2a2a00, 0x2a00002a2a002a2a, 0x002a00000000002a}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); uint64_t expected_z2[] = // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 {0xe9eaebecd6d6d6f0, 0xd6f2f3d6d6f6d6d6, 0xf9d6fbfcfdfeffd6}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 {0x00000000ffffff00, 0xff0000ffff00ffff, 0x00ff0000000000ff}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = // pg: 0 0 1 0 0 1 0 1 1 0 0 1 {0xe9eaebec007feff0, 0xf1f2007ff5f6007f, 0x007ffbfcfdfe007f}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = // pg: 0 0 1 1 0 1 {0x0000000000000000, 0xffffff80ffffff80, 0x00000000ffffff80}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = // pg: 0 1 1 {0xe9eaebecedeeeff0, 0xffffffffffffffff, 0xffffffffffffffff}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = // pg: 0 0 1 0 0 1 0 1 1 0 0 1 {0xe9eaebeccfc0eff0, 0xf1f2cfc0f5f6cfc0, 0xcfc0fbfcfdfecfc0}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = // pg: 0 0 1 1 0 1 {0x0000000000000000, 0x4000000040000000, 0x0000000040000000}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = // pg: 0 1 1 {0xe9eaebecedeeeff0, 0xc010000000000000, 0xc010000000000000}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z10[] = // pg: 0 0 1 0 0 1 0 1 1 0 0 1 {0xe9eaebec00ffeff0, 0xf1f200fff5f600ff, 0x00fffbfcfdfe00ff}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); uint64_t expected_z11[] = // pg: 0 1 1 {0x0000000000000000, 0x0123456789abcdef, 0x0123456789abcdef}; ASSERT_EQUAL_SVE(expected_z11, z11.VnD()); // clang-format on } } TEST_SVE(sve_fcpy_imm) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 0, 1, 1 // For S lanes: 0, 1, 1, 0, 1 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1}; PRegister pg = p7; Initialise(&masm, pg.VnB(), pg_in); // These are (mostly) merging operations, so we have to initialise the result // registers for each operation. for (unsigned i = 0; i < kNumberOfZRegisters; i++) { __ Index(ZRegister(i, kBRegSize), 0, -1); } // Encodable floating-point forms (FCPY_z_p_i) __ Fcpy(z1.VnH(), pg.Merging(), Float16(1.0)); __ Fcpy(z2.VnH(), pg.Merging(), -2.0f); __ Fcpy(z3.VnH(), pg.Merging(), 3.0); __ Fcpy(z4.VnS(), pg.Merging(), Float16(-4.0)); __ Fcpy(z5.VnS(), pg.Merging(), 5.0f); __ Fcpy(z6.VnS(), pg.Merging(), 6.0); __ Fcpy(z7.VnD(), pg.Merging(), Float16(7.0)); __ Fcpy(z8.VnD(), pg.Merging(), 8.0f); __ Fmov(z9.VnD(), pg.Merging(), -9.0); // Unencodable immediates. __ Fcpy(z10.VnS(), pg.Merging(), 0.0); __ Fcpy(z11.VnH(), pg.Merging(), Float16(42.0)); __ Fcpy(z12.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN __ Fcpy(z13.VnH(), pg.Merging(), kFP64NegativeInfinity); // Fmov alias. 
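// `Fmov` with a governing predicate and an FP immediate is an alias of
// `Fcpy`, so z14-z17 should end up identical to z10-z13; the final
// ASSERT_EQUAL_SVE comparisons below check exactly that.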
__ Fmov(z14.VnS(), pg.Merging(), 0.0); __ Fmov(z15.VnH(), pg.Merging(), Float16(42.0)); __ Fmov(z16.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN __ Fmov(z17.VnH(), pg.Merging(), kFP64NegativeInfinity); END(); if (CAN_RUN()) { RUN(); // clang-format off // 1.0 as FP16: 0x3c00 uint64_t expected_z1[] = // pg: 0 0 1 0 0 1 0 1 1 0 0 1 {0xe9eaebec3c00eff0, 0xf1f23c00f5f63c00, 0x3c00fbfcfdfe3c00}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); // -2.0 as FP16: 0xc000 uint64_t expected_z2[] = // pg: 0 0 1 0 0 1 0 1 1 0 0 1 {0xe9eaebecc000eff0, 0xf1f2c000f5f6c000, 0xc000fbfcfdfec000}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); // 3.0 as FP16: 0x4200 uint64_t expected_z3[] = // pg: 0 0 1 0 0 1 0 1 1 0 0 1 {0xe9eaebec4200eff0, 0xf1f24200f5f64200, 0x4200fbfcfdfe4200}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); // -4.0 as FP32: 0xc0800000 uint64_t expected_z4[] = // pg: 0 0 1 1 0 1 {0xe9eaebecedeeeff0, 0xc0800000c0800000, 0xf9fafbfcc0800000}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); // 5.0 as FP32: 0x40a00000 uint64_t expected_z5[] = // pg: 0 0 1 1 0 1 {0xe9eaebecedeeeff0, 0x40a0000040a00000, 0xf9fafbfc40a00000}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); // 6.0 as FP32: 0x40c00000 uint64_t expected_z6[] = // pg: 0 0 1 1 0 1 {0xe9eaebecedeeeff0, 0x40c0000040c00000, 0xf9fafbfc40c00000}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); // 7.0 as FP64: 0x401c000000000000 uint64_t expected_z7[] = // pg: 0 1 1 {0xe9eaebecedeeeff0, 0x401c000000000000, 0x401c000000000000}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); // 8.0 as FP64: 0x4020000000000000 uint64_t expected_z8[] = // pg: 0 1 1 {0xe9eaebecedeeeff0, 0x4020000000000000, 0x4020000000000000}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); // -9.0 as FP64: 0xc022000000000000 uint64_t expected_z9[] = // pg: 0 1 1 {0xe9eaebecedeeeff0, 0xc022000000000000, 0xc022000000000000}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); // 0.0 as FP32: 0x00000000 uint64_t expected_z10[] = // pg: 0 0 1 1 0 1 {0xe9eaebecedeeeff0, 0x0000000000000000, 0xf9fafbfc00000000}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); // 42.0 as FP16: 0x5140 uint64_t expected_z11[] = // pg: 0 0 1 0 0 1 0 1 1 0 0 1 {0xe9eaebec5140eff0, 0xf1f25140f5f65140, 0x5140fbfcfdfe5140}; ASSERT_EQUAL_SVE(expected_z11, z11.VnD()); // Signalling NaN (with payload): 0x7ff0000012340000 uint64_t expected_z12[] = // pg: 0 1 1 {0xe9eaebecedeeeff0, 0x7ff0000012340000, 0x7ff0000012340000}; ASSERT_EQUAL_SVE(expected_z12, z12.VnD()); // -infinity as FP16: 0xfc00 uint64_t expected_z13[] = // pg: 0 0 1 0 0 1 0 1 1 0 0 1 {0xe9eaebecfc00eff0, 0xf1f2fc00f5f6fc00, 0xfc00fbfcfdfefc00}; ASSERT_EQUAL_SVE(expected_z13, z13.VnD()); ASSERT_EQUAL_SVE(z10.VnD(), z14.VnD()); ASSERT_EQUAL_SVE(z11.VnD(), z15.VnD()); ASSERT_EQUAL_SVE(z12.VnD(), z16.VnD()); ASSERT_EQUAL_SVE(z13.VnD(), z17.VnD()); // clang-format on } } TEST_SVE(sve_permute_vector_unpredicated_table_lookup) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t table_inputs[] = {0xffeeddccbbaa9988, 0x7766554433221100}; int index_b[] = {255, 255, 11, 10, 15, 14, 13, 12, 1, 0, 4, 3, 7, 6, 5, 4}; int index_h[] = {5, 6, 7, 8, 2, 3, 6, 4}; int index_s[] = {1, 3, 2, 31, -1}; int index_d[] = {31, 1}; // Initialize the register with a value that doesn't existed in the table. 
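// z9 will hold the 16 table bytes in its lowest lanes and 0x1f everywhere
// else. An index such as 255 selects byte lane 255, which only exists on a
// 2048-bit vector (where it would read the 0x1f filler); on shorter vectors
// `Tbl` returns zero for that element. The `(vl > index * kBRegSize)` guards
// in the checks below allow for both cases.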
__ Dup(z9.VnB(), 0x1f); InsrHelper(&masm, z9.VnD(), table_inputs); ZRegister ind_b = z0.WithLaneSize(kBRegSize); ZRegister ind_h = z1.WithLaneSize(kHRegSize); ZRegister ind_s = z2.WithLaneSize(kSRegSize); ZRegister ind_d = z3.WithLaneSize(kDRegSize); InsrHelper(&masm, ind_b, index_b); InsrHelper(&masm, ind_h, index_h); InsrHelper(&masm, ind_s, index_s); InsrHelper(&masm, ind_d, index_d); __ Tbl(z26.VnB(), z9.VnB(), ind_b); __ Tbl(z27.VnH(), z9.VnH(), ind_h); __ Tbl(z28.VnS(), z9.VnS(), ind_s); __ Tbl(z29.VnD(), z9.VnD(), ind_d); END(); if (CAN_RUN()) { RUN(); // clang-format off unsigned z26_expected[] = {0x1f, 0x1f, 0xbb, 0xaa, 0xff, 0xee, 0xdd, 0xcc, 0x11, 0x00, 0x44, 0x33, 0x77, 0x66, 0x55, 0x44}; unsigned z27_expected[] = {0xbbaa, 0xddcc, 0xffee, 0x1f1f, 0x5544, 0x7766, 0xddcc, 0x9988}; unsigned z28_expected[] = {0x77665544, 0xffeeddcc, 0xbbaa9988, 0x1f1f1f1f, 0x1f1f1f1f}; uint64_t z29_expected[] = {0x1f1f1f1f1f1f1f1f, 0xffeeddccbbaa9988}; // clang-format on unsigned vl = config->sve_vl_in_bits(); for (size_t i = 0; i < ArrayLength(index_b); i++) { int lane = static_cast(ArrayLength(index_b) - i - 1); if (!core.HasSVELane(z26.VnB(), lane)) break; uint64_t expected = (vl > (index_b[i] * kBRegSize)) ? z26_expected[i] : 0; ASSERT_EQUAL_SVE_LANE(expected, z26.VnB(), lane); } for (size_t i = 0; i < ArrayLength(index_h); i++) { int lane = static_cast(ArrayLength(index_h) - i - 1); if (!core.HasSVELane(z27.VnH(), lane)) break; uint64_t expected = (vl > (index_h[i] * kHRegSize)) ? z27_expected[i] : 0; ASSERT_EQUAL_SVE_LANE(expected, z27.VnH(), lane); } for (size_t i = 0; i < ArrayLength(index_s); i++) { int lane = static_cast(ArrayLength(index_s) - i - 1); if (!core.HasSVELane(z28.VnS(), lane)) break; uint64_t expected = (vl > (index_s[i] * kSRegSize)) ? z28_expected[i] : 0; ASSERT_EQUAL_SVE_LANE(expected, z28.VnS(), lane); } for (size_t i = 0; i < ArrayLength(index_d); i++) { int lane = static_cast(ArrayLength(index_d) - i - 1); if (!core.HasSVELane(z29.VnD(), lane)) break; uint64_t expected = (vl > (index_d[i] * kDRegSize)) ? z29_expected[i] : 0; ASSERT_EQUAL_SVE_LANE(expected, z29.VnD(), lane); } } } TEST_SVE(ldr_str_z_bi) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int vl = config->sve_vl_in_bytes(); // The immediate can address [-256, 255] times the VL, so allocate enough // space to exceed that in both directions. int data_size = vl * 1024; uint8_t* data = new uint8_t[data_size]; memset(data, 0, data_size); // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); __ Index(z1.VnB(), 1, 3); __ Index(z2.VnB(), 2, 5); __ Index(z3.VnB(), 3, 7); __ Index(z4.VnB(), 4, 11); __ Index(z5.VnB(), 5, 13); __ Index(z6.VnB(), 6, 2); __ Index(z7.VnB(), 7, 3); __ Index(z8.VnB(), 8, 5); __ Index(z9.VnB(), 9, 7); // Encodable cases. __ Str(z1, SVEMemOperand(x0)); __ Str(z2, SVEMemOperand(x0, 2, SVE_MUL_VL)); __ Str(z3, SVEMemOperand(x0, -3, SVE_MUL_VL)); __ Str(z4, SVEMemOperand(x0, 255, SVE_MUL_VL)); __ Str(z5, SVEMemOperand(x0, -256, SVE_MUL_VL)); // Cases that fall back on `CalculateSVEAddress`. __ Str(z6, SVEMemOperand(x0, 6 * vl)); __ Str(z7, SVEMemOperand(x0, -7 * vl)); __ Str(z8, SVEMemOperand(x0, 314, SVE_MUL_VL)); __ Str(z9, SVEMemOperand(x0, -314, SVE_MUL_VL)); // Corresponding loads. __ Ldr(z11, SVEMemOperand(x0, xzr)); // Test xzr operand. 
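// `SVEMemOperand(x0, xzr)` is the scalar-plus-scalar form with a zero index,
// so it should resolve to the same address as the plain `SVEMemOperand(x0)`
// used to store z1; the ASSERT_EQUAL_SVE(z1, z11) check below relies on that.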
__ Ldr(z12, SVEMemOperand(x0, 2, SVE_MUL_VL)); __ Ldr(z13, SVEMemOperand(x0, -3, SVE_MUL_VL)); __ Ldr(z14, SVEMemOperand(x0, 255, SVE_MUL_VL)); __ Ldr(z15, SVEMemOperand(x0, -256, SVE_MUL_VL)); __ Ldr(z16, SVEMemOperand(x0, 6 * vl)); __ Ldr(z17, SVEMemOperand(x0, -7 * vl)); __ Ldr(z18, SVEMemOperand(x0, 314, SVE_MUL_VL)); __ Ldr(z19, SVEMemOperand(x0, -314, SVE_MUL_VL)); END(); if (CAN_RUN()) { RUN(); uint8_t* expected = new uint8_t[data_size]; memset(expected, 0, data_size); uint8_t* middle = &expected[data_size / 2]; for (int i = 0; i < vl; i++) { middle[i] = (1 + (3 * i)) & 0xff; // z1 middle[(2 * vl) + i] = (2 + (5 * i)) & 0xff; // z2 middle[(-3 * vl) + i] = (3 + (7 * i)) & 0xff; // z3 middle[(255 * vl) + i] = (4 + (11 * i)) & 0xff; // z4 middle[(-256 * vl) + i] = (5 + (13 * i)) & 0xff; // z5 middle[(6 * vl) + i] = (6 + (2 * i)) & 0xff; // z6 middle[(-7 * vl) + i] = (7 + (3 * i)) & 0xff; // z7 middle[(314 * vl) + i] = (8 + (5 * i)) & 0xff; // z8 middle[(-314 * vl) + i] = (9 + (7 * i)) & 0xff; // z9 } ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected); ASSERT_EQUAL_SVE(z1, z11); ASSERT_EQUAL_SVE(z2, z12); ASSERT_EQUAL_SVE(z3, z13); ASSERT_EQUAL_SVE(z4, z14); ASSERT_EQUAL_SVE(z5, z15); ASSERT_EQUAL_SVE(z6, z16); ASSERT_EQUAL_SVE(z7, z17); ASSERT_EQUAL_SVE(z8, z18); ASSERT_EQUAL_SVE(z9, z19); delete[] expected; } delete[] data; } TEST_SVE(ldr_str_p_bi) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int vl = config->sve_vl_in_bytes(); VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0); int pl = vl / kZRegBitsPerPRegBit; // The immediate can address [-256, 255] times the PL, so allocate enough // space to exceed that in both directions. int data_size = pl * 1024; uint8_t* data = new uint8_t[data_size]; memset(data, 0, data_size); // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); uint64_t pattern[4] = {0x1010101011101111, 0x0010111011000101, 0x1001101110010110, 0x1010110101100011}; for (int i = 8; i <= 15; i++) { // Initialise p8-p15 with a conveniently-recognisable, non-zero pattern. Initialise(&masm, PRegister(i), pattern[3] * i, pattern[2] * i, pattern[1] * i, pattern[0] * i); } // Encodable cases. __ Str(p8, SVEMemOperand(x0)); __ Str(p9, SVEMemOperand(x0, 2, SVE_MUL_VL)); __ Str(p10, SVEMemOperand(x0, -3, SVE_MUL_VL)); __ Str(p11, SVEMemOperand(x0, 255, SVE_MUL_VL)); // Cases that fall back on `CalculateSVEAddress`. __ Str(p12, SVEMemOperand(x0, 6 * pl)); __ Str(p13, SVEMemOperand(x0, -7 * pl)); __ Str(p14, SVEMemOperand(x0, 314, SVE_MUL_VL)); __ Str(p15, SVEMemOperand(x0, -314, SVE_MUL_VL)); // Corresponding loads. __ Ldr(p0, SVEMemOperand(x0)); __ Ldr(p1, SVEMemOperand(x0, 2, SVE_MUL_VL)); __ Ldr(p2, SVEMemOperand(x0, -3, SVE_MUL_VL)); __ Ldr(p3, SVEMemOperand(x0, 255, SVE_MUL_VL)); __ Ldr(p4, SVEMemOperand(x0, 6 * pl)); __ Ldr(p5, SVEMemOperand(x0, -7 * pl)); __ Ldr(p6, SVEMemOperand(x0, 314, SVE_MUL_VL)); __ Ldr(p7, SVEMemOperand(x0, -314, SVE_MUL_VL)); END(); if (CAN_RUN()) { RUN(); uint8_t* expected = new uint8_t[data_size]; memset(expected, 0, data_size); uint8_t* middle = &expected[data_size / 2]; for (int i = 0; i < pl; i++) { int bit_index = (i % sizeof(pattern[0])) * kBitsPerByte; size_t index = i / sizeof(pattern[0]); VIXL_ASSERT(index < ArrayLength(pattern)); uint64_t byte = (pattern[index] >> bit_index) & 0xff; // Each byte of `pattern` can be multiplied by 15 without carry. 
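// To see why: every byte of `pattern` is one of 0x00, 0x01, 0x10 or 0x11, so
// the largest possible product is 0x11 * 15 = 0xff. Scaling a whole pattern
// by the register number (8 to 15) therefore never carries between bytes,
// which is what allows the expected image to be rebuilt one byte at a time.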
VIXL_ASSERT((byte * 15) <= 0xff); middle[i] = byte * 8; // p8 middle[(2 * pl) + i] = byte * 9; // p9 middle[(-3 * pl) + i] = byte * 10; // p10 middle[(255 * pl) + i] = byte * 11; // p11 middle[(6 * pl) + i] = byte * 12; // p12 middle[(-7 * pl) + i] = byte * 13; // p13 middle[(314 * pl) + i] = byte * 14; // p14 middle[(-314 * pl) + i] = byte * 15; // p15 } ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected); ASSERT_EQUAL_SVE(p0, p8); ASSERT_EQUAL_SVE(p1, p9); ASSERT_EQUAL_SVE(p2, p10); ASSERT_EQUAL_SVE(p3, p11); ASSERT_EQUAL_SVE(p4, p12); ASSERT_EQUAL_SVE(p5, p13); ASSERT_EQUAL_SVE(p6, p14); ASSERT_EQUAL_SVE(p7, p15); delete[] expected; } delete[] data; } template static void MemoryWrite(uint8_t* base, int64_t offset, int64_t index, T data) { memcpy(base + offset + (index * sizeof(data)), &data, sizeof(data)); } TEST_SVE(sve_ld1_st1_contiguous) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int vl = config->sve_vl_in_bytes(); // The immediate can address [-8, 7] times the VL, so allocate enough space to // exceed that in both directions. int data_size = vl * 128; uint8_t* data = new uint8_t[data_size]; memset(data, 0, data_size); // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); // Encodable scalar-plus-immediate cases. __ Index(z1.VnB(), 1, -3); __ Ptrue(p1.VnB()); __ St1b(z1.VnB(), p1, SVEMemOperand(x0)); __ Index(z2.VnH(), -2, 5); __ Ptrue(p2.VnH(), SVE_MUL3); __ St1b(z2.VnH(), p2, SVEMemOperand(x0, 7, SVE_MUL_VL)); __ Index(z3.VnS(), 3, -7); __ Ptrue(p3.VnS(), SVE_POW2); __ St1h(z3.VnS(), p3, SVEMemOperand(x0, -8, SVE_MUL_VL)); // Encodable scalar-plus-scalar cases. __ Index(z4.VnD(), -4, 11); __ Ptrue(p4.VnD(), SVE_VL3); __ Addvl(x1, x0, 8); // Try not to overlap with VL-dependent cases. __ Mov(x2, 17); __ St1b(z4.VnD(), p4, SVEMemOperand(x1, x2)); __ Index(z5.VnD(), 6, -2); __ Ptrue(p5.VnD(), SVE_VL16); __ Addvl(x3, x0, 10); // Try not to overlap with VL-dependent cases. __ Mov(x4, 6); __ St1d(z5.VnD(), p5, SVEMemOperand(x3, x4, LSL, 3)); // Unencodable cases fall back on `CalculateSVEAddress`. __ Index(z6.VnS(), -7, 3); // Setting SVE_ALL on B lanes checks that the Simulator ignores irrelevant // predicate bits when handling larger lanes. __ Ptrue(p6.VnB(), SVE_ALL); __ St1w(z6.VnS(), p6, SVEMemOperand(x0, 42, SVE_MUL_VL)); __ Index(z7.VnD(), 32, -11); __ Ptrue(p7.VnD(), SVE_MUL4); __ St1w(z7.VnD(), p7, SVEMemOperand(x0, 22, SVE_MUL_VL)); // Corresponding loads. __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0)); __ Ld1b(z9.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL)); __ Ld1h(z10.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL)); __ Ld1b(z11.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2)); __ Ld1d(z12.VnD(), p5.Zeroing(), SVEMemOperand(x3, x4, LSL, 3)); __ Ld1w(z13.VnS(), p6.Zeroing(), SVEMemOperand(x0, 42, SVE_MUL_VL)); __ Ld1sb(z14.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL)); __ Ld1sh(z15.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL)); __ Ld1sb(z16.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2)); __ Ld1sw(z17.VnD(), p7.Zeroing(), SVEMemOperand(x0, 22, SVE_MUL_VL)); // We can test ld1 by comparing the value loaded with the value stored. In // most cases, there are two complications: // - Loads have zeroing predication, so we have to clear the inactive // elements on our reference. // - We have to replicate any sign- or zero-extension. // Ld1b(z8.VnB(), ...) __ Dup(z18.VnB(), 0); __ Mov(z18.VnB(), p1.Merging(), z1.VnB()); // Ld1b(z9.VnH(), ...) 
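// An unsigned Ld1b into H-sized lanes loads one byte per halfword and
// zero-extends it, so the reference is built by zero-extending the low byte
// of each stored halfword (`Uxtb`) under the same predicate, with inactive
// lanes cleared to mirror the load's zeroing predication.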
__ Dup(z19.VnH(), 0); __ Uxtb(z19.VnH(), p2.Merging(), z2.VnH()); // Ld1h(z10.VnS(), ...) __ Dup(z20.VnS(), 0); __ Uxth(z20.VnS(), p3.Merging(), z3.VnS()); // Ld1b(z11.VnD(), ...) __ Dup(z21.VnD(), 0); __ Uxtb(z21.VnD(), p4.Merging(), z4.VnD()); // Ld1d(z12.VnD(), ...) __ Dup(z22.VnD(), 0); __ Mov(z22.VnD(), p5.Merging(), z5.VnD()); // Ld1w(z13.VnS(), ...) __ Dup(z23.VnS(), 0); __ Mov(z23.VnS(), p6.Merging(), z6.VnS()); // Ld1sb(z14.VnH(), ...) __ Dup(z24.VnH(), 0); __ Sxtb(z24.VnH(), p2.Merging(), z2.VnH()); // Ld1sh(z15.VnS(), ...) __ Dup(z25.VnS(), 0); __ Sxth(z25.VnS(), p3.Merging(), z3.VnS()); // Ld1sb(z16.VnD(), ...) __ Dup(z26.VnD(), 0); __ Sxtb(z26.VnD(), p4.Merging(), z4.VnD()); // Ld1sw(z17.VnD(), ...) __ Dup(z27.VnD(), 0); __ Sxtw(z27.VnD(), p7.Merging(), z7.VnD()); END(); if (CAN_RUN()) { RUN(); uint8_t* expected = new uint8_t[data_size]; memset(expected, 0, data_size); uint8_t* middle = &expected[data_size / 2]; int vl_b = vl / kBRegSizeInBytes; int vl_h = vl / kHRegSizeInBytes; int vl_s = vl / kSRegSizeInBytes; int vl_d = vl / kDRegSizeInBytes; // Encodable cases. // st1b { z1.b }, SVE_ALL for (int i = 0; i < vl_b; i++) { MemoryWrite(middle, 0, i, static_cast(1 - (3 * i))); } // st1b { z2.h }, SVE_MUL3 int vl_h_mul3 = vl_h - (vl_h % 3); for (int i = 0; i < vl_h_mul3; i++) { int64_t offset = 7 * static_cast(vl / (kHRegSize / kBRegSize)); MemoryWrite(middle, offset, i, static_cast(-2 + (5 * i))); } // st1h { z3.s }, SVE_POW2 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s); for (int i = 0; i < vl_s_pow2; i++) { int64_t offset = -8 * static_cast(vl / (kSRegSize / kHRegSize)); MemoryWrite(middle, offset, i, static_cast(3 - (7 * i))); } // st1b { z4.d }, SVE_VL3 if (vl_d >= 3) { for (int i = 0; i < 3; i++) { MemoryWrite(middle, (8 * vl) + 17, i, static_cast(-4 + (11 * i))); } } // st1d { z5.d }, SVE_VL16 if (vl_d >= 16) { for (int i = 0; i < 16; i++) { MemoryWrite(middle, (10 * vl) + (6 * kDRegSizeInBytes), i, static_cast(6 - (2 * i))); } } // Unencodable cases. // st1w { z6.s }, SVE_ALL for (int i = 0; i < vl_s; i++) { MemoryWrite(middle, 42 * vl, i, static_cast(-7 + (3 * i))); } // st1w { z7.d }, SVE_MUL4 int vl_d_mul4 = vl_d - (vl_d % 4); for (int i = 0; i < vl_d_mul4; i++) { int64_t offset = 22 * static_cast(vl / (kDRegSize / kWRegSize)); MemoryWrite(middle, offset, i, static_cast(32 + (-11 * i))); } ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected); // Check that we loaded back the expected values. ASSERT_EQUAL_SVE(z18, z8); ASSERT_EQUAL_SVE(z19, z9); ASSERT_EQUAL_SVE(z20, z10); ASSERT_EQUAL_SVE(z21, z11); ASSERT_EQUAL_SVE(z22, z12); ASSERT_EQUAL_SVE(z23, z13); ASSERT_EQUAL_SVE(z24, z14); ASSERT_EQUAL_SVE(z25, z15); ASSERT_EQUAL_SVE(z26, z16); ASSERT_EQUAL_SVE(z27, z17); delete[] expected; } delete[] data; } TEST_SVE(sve_ld2_st2_scalar_plus_imm) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int vl = config->sve_vl_in_bytes(); // The immediate can address [-16, 14] times the VL, so allocate enough space // to exceed that in both directions. int data_size = vl * 128; uint8_t* data = new uint8_t[data_size]; memset(data, 0, data_size); // Set the base half-way through the buffer so we can use negative indices. 
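// With data_size = vl * 128 and the base placed in the middle, there are
// 64 VLs of space on each side, comfortably more than the largest offset
// used here (14 VLs, plus the two vectors of data written at that offset).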
__ Mov(x0, reinterpret_cast(&data[data_size / 2])); __ Index(z14.VnB(), 1, -3); __ Index(z15.VnB(), 2, -3); __ Ptrue(p0.VnB()); __ St2b(z14.VnB(), z15.VnB(), p0, SVEMemOperand(x0)); __ Index(z16.VnH(), -2, 5); __ Index(z17.VnH(), -3, 5); __ Ptrue(p1.VnH(), SVE_MUL3); __ St2h(z16.VnH(), z17.VnH(), p1, SVEMemOperand(x0, 8, SVE_MUL_VL)); // Wrap around from z31 to z0. __ Index(z31.VnS(), 3, -7); __ Index(z0.VnS(), 4, -7); __ Ptrue(p2.VnS(), SVE_POW2); __ St2w(z31.VnS(), z0.VnS(), p2, SVEMemOperand(x0, -12, SVE_MUL_VL)); __ Index(z18.VnD(), -7, 3); __ Index(z19.VnD(), -8, 3); // Sparse predication, including some irrelevant bits (0xe). To make the // results easy to check, activate each lane where n is a multiple of 5. Initialise(&masm, p3, 0xeee10000000001ee, 0xeeeeeee100000000, 0x01eeeeeeeee10000, 0x000001eeeeeeeee1); __ St2d(z18.VnD(), z19.VnD(), p3, SVEMemOperand(x0, 14, SVE_MUL_VL)); // We can test ld2 by comparing the values loaded with the values stored. // There are two complications: // - Loads have zeroing predication, so we have to clear the inactive // elements on our reference. // - We want to test both loads and stores that span { z31, z0 }, so we have // to move some values around. // // Registers z4-z11 will hold as-stored values (with inactive elements // cleared). Registers z20-z27 will hold the values that were loaded. // Ld2b(z14.VnB(), z15.VnB(), ...) __ Dup(z4.VnB(), 0); __ Dup(z5.VnB(), 0); __ Mov(z4.VnB(), p0.Merging(), z14.VnB()); __ Mov(z5.VnB(), p0.Merging(), z15.VnB()); // Ld2h(z16.VnH(), z17.VnH(), ...) __ Dup(z6.VnH(), 0); __ Dup(z7.VnH(), 0); __ Mov(z6.VnH(), p1.Merging(), z16.VnH()); __ Mov(z7.VnH(), p1.Merging(), z17.VnH()); // Ld2w(z31.VnS(), z0.VnS(), ...) __ Dup(z8.VnS(), 0); __ Dup(z9.VnS(), 0); __ Mov(z8.VnS(), p2.Merging(), z31.VnS()); __ Mov(z9.VnS(), p2.Merging(), z0.VnS()); // Ld2d(z18.VnD(), z19.VnD(), ...) __ Dup(z10.VnD(), 0); __ Dup(z11.VnD(), 0); __ Mov(z10.VnD(), p3.Merging(), z18.VnD()); __ Mov(z11.VnD(), p3.Merging(), z19.VnD()); // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap. 
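// Multi-register SVE loads and stores use consecutively numbered registers,
// wrapping from z31 back to z0, so { z31, z0 } is a valid pair worth testing.
// Loading into it and then copying the results out to z20 and z21 lets the
// pair be reused without disturbing values still needed for the checks.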
__ Ld2b(z31.VnB(), z0.VnB(), p0.Zeroing(), SVEMemOperand(x0)); __ Mov(z20, z31); __ Mov(z21, z0); __ Ld2h(z22.VnH(), z23.VnH(), p1.Zeroing(), SVEMemOperand(x0, 8, SVE_MUL_VL)); __ Ld2w(z24.VnS(), z25.VnS(), p2.Zeroing(), SVEMemOperand(x0, -12, SVE_MUL_VL)); __ Ld2d(z26.VnD(), z27.VnD(), p3.Zeroing(), SVEMemOperand(x0, 14, SVE_MUL_VL)); END(); if (CAN_RUN()) { RUN(); uint8_t* expected = new uint8_t[data_size]; memset(expected, 0, data_size); uint8_t* middle = &expected[data_size / 2]; int vl_b = vl / kBRegSizeInBytes; int vl_h = vl / kHRegSizeInBytes; int vl_s = vl / kSRegSizeInBytes; int vl_d = vl / kDRegSizeInBytes; int reg_count = 2; // st2b { z14.b, z15.b }, SVE_ALL for (int i = 0; i < vl_b; i++) { uint8_t lane0 = 1 - (3 * i); uint8_t lane1 = 2 - (3 * i); MemoryWrite(middle, 0, (i * reg_count) + 0, lane0); MemoryWrite(middle, 0, (i * reg_count) + 1, lane1); } // st2h { z16.h, z17.h }, SVE_MUL3 int vl_h_mul3 = vl_h - (vl_h % 3); for (int i = 0; i < vl_h_mul3; i++) { int64_t offset = 8 * vl; uint16_t lane0 = -2 + (5 * i); uint16_t lane1 = -3 + (5 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); } // st2w { z31.s, z0.s }, SVE_POW2 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s); for (int i = 0; i < vl_s_pow2; i++) { int64_t offset = -12 * vl; uint32_t lane0 = 3 - (7 * i); uint32_t lane1 = 4 - (7 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); } // st2d { z18.d, z19.d }, ((i % 5) == 0) for (int i = 0; i < vl_d; i++) { if ((i % 5) == 0) { int64_t offset = 14 * vl; uint64_t lane0 = -7 + (3 * i); uint64_t lane1 = -8 + (3 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); } } ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected); // Check that we loaded back the expected values. // st2b/ld2b ASSERT_EQUAL_SVE(z4, z20); ASSERT_EQUAL_SVE(z5, z21); // st2h/ld2h ASSERT_EQUAL_SVE(z6, z22); ASSERT_EQUAL_SVE(z7, z23); // st2w/ld2w ASSERT_EQUAL_SVE(z8, z24); ASSERT_EQUAL_SVE(z9, z25); // st2d/ld2d ASSERT_EQUAL_SVE(z10, z26); ASSERT_EQUAL_SVE(z11, z27); delete[] expected; } delete[] data; } TEST_SVE(sve_ld2_st2_scalar_plus_scalar) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int vl = config->sve_vl_in_bytes(); // Allocate plenty of space to enable indexing in both directions. int data_size = vl * 128; uint8_t* data = new uint8_t[data_size]; memset(data, 0, data_size); // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); __ Index(z10.VnB(), -4, 11); __ Index(z11.VnB(), -5, 11); __ Ptrue(p7.VnB(), SVE_MUL4); __ Mov(x1, 0); __ St2b(z10.VnB(), z11.VnB(), p7, SVEMemOperand(x0, x1)); __ Index(z12.VnH(), 6, -2); __ Index(z13.VnH(), 7, -2); __ Ptrue(p6.VnH(), SVE_VL16); __ Rdvl(x2, 3); // Make offsets VL-dependent so we can avoid overlap. __ St2h(z12.VnH(), z13.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1)); __ Index(z14.VnS(), -7, 3); __ Index(z15.VnS(), -8, 3); // Sparse predication, including some irrelevant bits (0xe). To make the // results easy to check, activate each lane where n is a multiple of 5. Initialise(&masm, p5, 0xeee1000010000100, 0x001eeee100001000, 0x0100001eeee10000, 0x10000100001eeee1); __ Rdvl(x3, -3); __ St2w(z14.VnS(), z15.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2)); // Wrap around from z31 to z0. 
__ Index(z31.VnD(), 32, -11); __ Index(z0.VnD(), 33, -11); __ Ptrue(p4.VnD(), SVE_MUL3); __ Rdvl(x4, 1); __ St2d(z31.VnD(), z0.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3)); // We can test ld2 by comparing the values loaded with the values stored. // There are two complications: // - Loads have zeroing predication, so we have to clear the inactive // elements on our reference. // - We want to test both loads and stores that span { z31, z0 }, so we have // to move some values around. // // Registers z4-z11 will hold as-stored values (with inactive elements // cleared). Registers z20-z27 will hold the values that were loaded. // Ld2b(z20.VnB(), z21.VnB(), ...) __ Dup(z4.VnB(), 0); __ Dup(z5.VnB(), 0); __ Mov(z4.VnB(), p7.Merging(), z10.VnB()); __ Mov(z5.VnB(), p7.Merging(), z11.VnB()); // Ld2h(z22.VnH(), z23.VnH(), ...) __ Dup(z6.VnH(), 0); __ Dup(z7.VnH(), 0); __ Mov(z6.VnH(), p6.Merging(), z12.VnH()); __ Mov(z7.VnH(), p6.Merging(), z13.VnH()); // Ld2w(z24.VnS(), z25.VnS(), ...) __ Dup(z8.VnS(), 0); __ Dup(z9.VnS(), 0); __ Mov(z8.VnS(), p5.Merging(), z14.VnS()); __ Mov(z9.VnS(), p5.Merging(), z15.VnS()); // Ld2d(z31.VnD(), z0.VnD(), ...) __ Dup(z10.VnD(), 0); __ Dup(z11.VnD(), 0); __ Mov(z10.VnD(), p4.Merging(), z31.VnD()); __ Mov(z11.VnD(), p4.Merging(), z0.VnD()); // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap. __ Ld2b(z31.VnB(), z0.VnB(), p7.Zeroing(), SVEMemOperand(x0, x1)); __ Mov(z20, z31); __ Mov(z21, z0); __ Ld2h(z22.VnH(), z23.VnH(), p6.Zeroing(), SVEMemOperand(x0, x2, LSL, 1)); __ Ld2w(z24.VnS(), z25.VnS(), p5.Zeroing(), SVEMemOperand(x0, x3, LSL, 2)); __ Ld2d(z26.VnD(), z27.VnD(), p4.Zeroing(), SVEMemOperand(x0, x4, LSL, 3)); END(); if (CAN_RUN()) { RUN(); uint8_t* expected = new uint8_t[data_size]; memset(expected, 0, data_size); uint8_t* middle = &expected[data_size / 2]; int vl_b = vl / kBRegSizeInBytes; int vl_h = vl / kHRegSizeInBytes; int vl_s = vl / kSRegSizeInBytes; int vl_d = vl / kDRegSizeInBytes; int reg_count = 2; // st2b { z10.b, z11.b }, SVE_MUL4 int vl_b_mul4 = vl_b - (vl_b % 4); for (int i = 0; i < vl_b_mul4; i++) { uint8_t lane0 = -4 + (11 * i); uint8_t lane1 = -5 + (11 * i); MemoryWrite(middle, 0, (i * reg_count) + 0, lane0); MemoryWrite(middle, 0, (i * reg_count) + 1, lane1); } // st2h { z12.h, z13.h }, SVE_VL16 if (vl_h >= 16) { for (int i = 0; i < 16; i++) { int64_t offset = (3 << kHRegSizeInBytesLog2) * vl; uint16_t lane0 = 6 - (2 * i); uint16_t lane1 = 7 - (2 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); } } // st2w { z14.s, z15.s }, ((i % 5) == 0) for (int i = 0; i < vl_s; i++) { if ((i % 5) == 0) { int64_t offset = -(3 << kSRegSizeInBytesLog2) * vl; uint32_t lane0 = -7 + (3 * i); uint32_t lane1 = -8 + (3 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); } } // st2d { z31.b, z0.b }, SVE_MUL3 int vl_d_mul3 = vl_d - (vl_d % 3); for (int i = 0; i < vl_d_mul3; i++) { int64_t offset = (1 << kDRegSizeInBytesLog2) * vl; uint64_t lane0 = 32 - (11 * i); uint64_t lane1 = 33 - (11 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); } ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected); // Check that we loaded back the expected values. 
// st2b/ld2b ASSERT_EQUAL_SVE(z4, z20); ASSERT_EQUAL_SVE(z5, z21); // st2h/ld2h ASSERT_EQUAL_SVE(z6, z22); ASSERT_EQUAL_SVE(z7, z23); // st2w/ld2w ASSERT_EQUAL_SVE(z8, z24); ASSERT_EQUAL_SVE(z9, z25); // st2d/ld2d ASSERT_EQUAL_SVE(z10, z26); ASSERT_EQUAL_SVE(z11, z27); delete[] expected; } delete[] data; } TEST_SVE(sve_ld3_st3_scalar_plus_imm) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int vl = config->sve_vl_in_bytes(); // The immediate can address [-24, 21] times the VL, so allocate enough space // to exceed that in both directions. int data_size = vl * 128; uint8_t* data = new uint8_t[data_size]; memset(data, 0, data_size); // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); // We can test ld3 by comparing the values loaded with the values stored. // There are two complications: // - Loads have zeroing predication, so we have to clear the inactive // elements on our reference. // - We want to test both loads and stores that span { z31, z0 }, so we have // to move some values around. // // Registers z4-z15 will hold as-stored values (with inactive elements // cleared). Registers z16-z27 will hold the values that were loaded. __ Index(z10.VnB(), 1, -3); __ Index(z11.VnB(), 2, -3); __ Index(z12.VnB(), 3, -3); __ Ptrue(p0.VnB()); __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p0, SVEMemOperand(x0)); // Save the stored values for ld3 tests. __ Dup(z4.VnB(), 0); __ Dup(z5.VnB(), 0); __ Dup(z6.VnB(), 0); __ Mov(z4.VnB(), p0.Merging(), z10.VnB()); __ Mov(z5.VnB(), p0.Merging(), z11.VnB()); __ Mov(z6.VnB(), p0.Merging(), z12.VnB()); // Wrap around from z31 to z0. __ Index(z31.VnH(), -2, 5); __ Index(z0.VnH(), -3, 5); __ Index(z1.VnH(), -4, 5); __ Ptrue(p1.VnH(), SVE_MUL3); __ St3h(z31.VnH(), z0.VnH(), z1.VnH(), p1, SVEMemOperand(x0, 9, SVE_MUL_VL)); // Save the stored values for ld3 tests. __ Dup(z7.VnH(), 0); __ Dup(z8.VnH(), 0); __ Dup(z9.VnH(), 0); __ Mov(z7.VnH(), p1.Merging(), z31.VnH()); __ Mov(z8.VnH(), p1.Merging(), z0.VnH()); __ Mov(z9.VnH(), p1.Merging(), z1.VnH()); __ Index(z30.VnS(), 3, -7); __ Index(z31.VnS(), 4, -7); __ Index(z0.VnS(), 5, -7); __ Ptrue(p2.VnS(), SVE_POW2); __ St3w(z30.VnS(), z31.VnS(), z0.VnS(), p2, SVEMemOperand(x0, -12, SVE_MUL_VL)); // Save the stored values for ld3 tests. __ Dup(z10.VnS(), 0); __ Dup(z11.VnS(), 0); __ Dup(z12.VnS(), 0); __ Mov(z10.VnS(), p2.Merging(), z30.VnS()); __ Mov(z11.VnS(), p2.Merging(), z31.VnS()); __ Mov(z12.VnS(), p2.Merging(), z0.VnS()); __ Index(z0.VnD(), -7, 3); __ Index(z1.VnD(), -8, 3); __ Index(z2.VnD(), -9, 3); // Sparse predication, including some irrelevant bits (0xee). To make the // results easy to check, activate each lane where n is a multiple of 5. Initialise(&masm, p3, 0xeee10000000001ee, 0xeeeeeee100000000, 0x01eeeeeeeee10000, 0x000001eeeeeeeee1); __ St3d(z0.VnD(), z1.VnD(), z2.VnD(), p3, SVEMemOperand(x0, 15, SVE_MUL_VL)); // Save the stored values for ld3 tests. __ Dup(z13.VnD(), 0); __ Dup(z14.VnD(), 0); __ Dup(z15.VnD(), 0); __ Mov(z13.VnD(), p3.Merging(), z0.VnD()); __ Mov(z14.VnD(), p3.Merging(), z1.VnD()); __ Mov(z15.VnD(), p3.Merging(), z2.VnD()); // Corresponding loads. // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap. 
__ Ld3b(z31.VnB(), z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(x0)); __ Mov(z16, z31); __ Mov(z17, z0); __ Mov(z18, z1); __ Ld3h(z30.VnH(), z31.VnH(), z0.VnH(), p1.Zeroing(), SVEMemOperand(x0, 9, SVE_MUL_VL)); __ Mov(z19, z30); __ Mov(z20, z31); __ Mov(z21, z0); __ Ld3w(z22.VnS(), z23.VnS(), z24.VnS(), p2.Zeroing(), SVEMemOperand(x0, -12, SVE_MUL_VL)); __ Ld3d(z25.VnD(), z26.VnD(), z27.VnD(), p3.Zeroing(), SVEMemOperand(x0, 15, SVE_MUL_VL)); END(); if (CAN_RUN()) { RUN(); uint8_t* expected = new uint8_t[data_size]; memset(expected, 0, data_size); uint8_t* middle = &expected[data_size / 2]; int vl_b = vl / kBRegSizeInBytes; int vl_h = vl / kHRegSizeInBytes; int vl_s = vl / kSRegSizeInBytes; int vl_d = vl / kDRegSizeInBytes; int reg_count = 3; // st3b { z10.b, z11.b, z12.b }, SVE_ALL for (int i = 0; i < vl_b; i++) { uint8_t lane0 = 1 - (3 * i); uint8_t lane1 = 2 - (3 * i); uint8_t lane2 = 3 - (3 * i); MemoryWrite(middle, 0, (i * reg_count) + 0, lane0); MemoryWrite(middle, 0, (i * reg_count) + 1, lane1); MemoryWrite(middle, 0, (i * reg_count) + 2, lane2); } // st3h { z31.h, z0.h, z1.h }, SVE_MUL3 int vl_h_mul3 = vl_h - (vl_h % 3); for (int i = 0; i < vl_h_mul3; i++) { int64_t offset = 9 * vl; uint16_t lane0 = -2 + (5 * i); uint16_t lane1 = -3 + (5 * i); uint16_t lane2 = -4 + (5 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); } // st3w { z30.s, z31.s, z0.s }, SVE_POW2 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s); for (int i = 0; i < vl_s_pow2; i++) { int64_t offset = -12 * vl; uint32_t lane0 = 3 - (7 * i); uint32_t lane1 = 4 - (7 * i); uint32_t lane2 = 5 - (7 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); } // st3d { z0.d, z1.d, z2.d }, ((i % 5) == 0) for (int i = 0; i < vl_d; i++) { if ((i % 5) == 0) { int64_t offset = 15 * vl; uint64_t lane0 = -7 + (3 * i); uint64_t lane1 = -8 + (3 * i); uint64_t lane2 = -9 + (3 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); } } ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected); // Check that we loaded back the expected values. // st3b/ld3b ASSERT_EQUAL_SVE(z4, z16); ASSERT_EQUAL_SVE(z5, z17); ASSERT_EQUAL_SVE(z6, z18); // st3h/ld3h ASSERT_EQUAL_SVE(z7, z19); ASSERT_EQUAL_SVE(z8, z20); ASSERT_EQUAL_SVE(z9, z21); // st3w/ld3w ASSERT_EQUAL_SVE(z10, z22); ASSERT_EQUAL_SVE(z11, z23); ASSERT_EQUAL_SVE(z12, z24); // st3d/ld3d ASSERT_EQUAL_SVE(z13, z25); ASSERT_EQUAL_SVE(z14, z26); ASSERT_EQUAL_SVE(z15, z27); delete[] expected; } delete[] data; } TEST_SVE(sve_ld3_st3_scalar_plus_scalar) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int vl = config->sve_vl_in_bytes(); // Allocate plenty of space to enable indexing in both directions. int data_size = vl * 128; uint8_t* data = new uint8_t[data_size]; memset(data, 0, data_size); // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); // We can test ld3 by comparing the values loaded with the values stored. // There are two complications: // - Loads have zeroing predication, so we have to clear the inactive // elements on our reference. 
// - We want to test both loads and stores that span { z31, z0 }, so we have // to move some values around. // // Registers z4-z15 will hold as-stored values (with inactive elements // cleared). Registers z16-z27 will hold the values that were loaded. __ Index(z10.VnB(), -4, 11); __ Index(z11.VnB(), -5, 11); __ Index(z12.VnB(), -6, 11); __ Ptrue(p7.VnB(), SVE_MUL4); __ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap. __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p7, SVEMemOperand(x0, x1, LSL, 0)); // Save the stored values for ld3 tests. __ Dup(z4.VnB(), 0); __ Dup(z5.VnB(), 0); __ Dup(z6.VnB(), 0); __ Mov(z4.VnB(), p7.Merging(), z10.VnB()); __ Mov(z5.VnB(), p7.Merging(), z11.VnB()); __ Mov(z6.VnB(), p7.Merging(), z12.VnB()); __ Index(z13.VnH(), 6, -2); __ Index(z14.VnH(), 7, -2); __ Index(z15.VnH(), 8, -2); __ Ptrue(p6.VnH(), SVE_VL16); __ Rdvl(x2, 5); // (5 * vl) << 1 = 10 * vl __ St3h(z13.VnH(), z14.VnH(), z15.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1)); // Save the stored values for ld3 tests. __ Dup(z7.VnH(), 0); __ Dup(z8.VnH(), 0); __ Dup(z9.VnH(), 0); __ Mov(z7.VnH(), p6.Merging(), z13.VnH()); __ Mov(z8.VnH(), p6.Merging(), z14.VnH()); __ Mov(z9.VnH(), p6.Merging(), z15.VnH()); // Wrap around from z31 to z0. __ Index(z30.VnS(), -7, 3); __ Index(z31.VnS(), -8, 3); __ Index(z0.VnS(), -9, 3); // Sparse predication, including some irrelevant bits (0xe). To make the // results easy to check, activate each lane where n is a multiple of 5. Initialise(&masm, p5, 0xeee1000010000100, 0x001eeee100001000, 0x0100001eeee10000, 0x10000100001eeee1); __ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl __ St3w(z30.VnS(), z31.VnS(), z0.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2)); // Save the stored values for ld3 tests. __ Dup(z10.VnS(), 0); __ Dup(z11.VnS(), 0); __ Dup(z12.VnS(), 0); __ Mov(z10.VnS(), p5.Merging(), z30.VnS()); __ Mov(z11.VnS(), p5.Merging(), z31.VnS()); __ Mov(z12.VnS(), p5.Merging(), z0.VnS()); __ Index(z31.VnD(), 32, -11); __ Index(z0.VnD(), 33, -11); __ Index(z1.VnD(), 34, -11); __ Ptrue(p4.VnD(), SVE_MUL3); __ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl __ St3d(z31.VnD(), z0.VnD(), z1.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3)); // Save the stored values for ld3 tests. __ Dup(z13.VnD(), 0); __ Dup(z14.VnD(), 0); __ Dup(z15.VnD(), 0); __ Mov(z13.VnD(), p4.Merging(), z31.VnD()); __ Mov(z14.VnD(), p4.Merging(), z0.VnD()); __ Mov(z15.VnD(), p4.Merging(), z1.VnD()); // Corresponding loads. // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap. 
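  // Multi-register transfer tuples are formed from consecutively-numbered
  // Z registers modulo 32, so { z31, z0, z1 } is a legal tuple. The results
  // are copied to z16-z18 immediately because the later loads reuse z31 and
  // z0 and would otherwise clobber them before the final checks.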
__ Ld3b(z31.VnB(), z0.VnB(), z1.VnB(), p7.Zeroing(), SVEMemOperand(x0, x1, LSL, 0)); __ Mov(z16, z31); __ Mov(z17, z0); __ Mov(z18, z1); __ Ld3h(z30.VnH(), z31.VnH(), z0.VnH(), p6.Zeroing(), SVEMemOperand(x0, x2, LSL, 1)); __ Mov(z19, z30); __ Mov(z20, z31); __ Mov(z21, z0); __ Ld3w(z22.VnS(), z23.VnS(), z24.VnS(), p5.Zeroing(), SVEMemOperand(x0, x3, LSL, 2)); __ Ld3d(z25.VnD(), z26.VnD(), z27.VnD(), p4.Zeroing(), SVEMemOperand(x0, x4, LSL, 3)); END(); if (CAN_RUN()) { RUN(); uint8_t* expected = new uint8_t[data_size]; memset(expected, 0, data_size); uint8_t* middle = &expected[data_size / 2]; int vl_b = vl / kBRegSizeInBytes; int vl_h = vl / kHRegSizeInBytes; int vl_s = vl / kSRegSizeInBytes; int vl_d = vl / kDRegSizeInBytes; int reg_count = 3; // st3b { z10.b, z11.b, z12.b }, SVE_MUL4 int vl_b_mul4 = vl_b - (vl_b % 4); for (int i = 0; i < vl_b_mul4; i++) { int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl; uint8_t lane0 = -4 + (11 * i); uint8_t lane1 = -5 + (11 * i); uint8_t lane2 = -6 + (11 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); } // st3h { z13.h, z14.h, z15.h }, SVE_VL16 if (vl_h >= 16) { for (int i = 0; i < 16; i++) { int64_t offset = (5 << kHRegSizeInBytesLog2) * vl; uint16_t lane0 = 6 - (2 * i); uint16_t lane1 = 7 - (2 * i); uint16_t lane2 = 8 - (2 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); } } // st3w { z30.s, z31.s, z0.s }, ((i % 5) == 0) for (int i = 0; i < vl_s; i++) { if ((i % 5) == 0) { int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl; uint32_t lane0 = -7 + (3 * i); uint32_t lane1 = -8 + (3 * i); uint32_t lane2 = -9 + (3 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); } } // st3d { z31.d, z0.d, z1.d }, SVE_MUL3 int vl_d_mul3 = vl_d - (vl_d % 3); for (int i = 0; i < vl_d_mul3; i++) { int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl; uint64_t lane0 = 32 - (11 * i); uint64_t lane1 = 33 - (11 * i); uint64_t lane2 = 34 - (11 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); } ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected); // Check that we loaded back the expected values. // st3b/ld3b ASSERT_EQUAL_SVE(z4, z16); ASSERT_EQUAL_SVE(z5, z17); ASSERT_EQUAL_SVE(z6, z18); // st3h/ld3h ASSERT_EQUAL_SVE(z7, z19); ASSERT_EQUAL_SVE(z8, z20); ASSERT_EQUAL_SVE(z9, z21); // st3w/ld3w ASSERT_EQUAL_SVE(z10, z22); ASSERT_EQUAL_SVE(z11, z23); ASSERT_EQUAL_SVE(z12, z24); // st3d/ld3d ASSERT_EQUAL_SVE(z13, z25); ASSERT_EQUAL_SVE(z14, z26); ASSERT_EQUAL_SVE(z15, z27); delete[] expected; } delete[] data; } TEST_SVE(sve_ld4_st4_scalar_plus_imm) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int vl = config->sve_vl_in_bytes(); // The immediate can address [-24, 21] times the VL, so allocate enough space // to exceed that in both directions. int data_size = vl * 128; uint8_t* data = new uint8_t[data_size]; memset(data, 0, data_size); // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); // We can test ld4 by comparing the values loaded with the values stored. 
// There are two complications: // - Loads have zeroing predication, so we have to clear the inactive // elements on our reference. // - We want to test both loads and stores that span { z31, z0 }, so we have // to move some values around. // // Registers z3-z18 will hold as-stored values (with inactive elements // cleared). Registers z19-z31 and z0-z2 will hold the values that were // loaded. __ Index(z10.VnB(), 1, -7); __ Index(z11.VnB(), 2, -7); __ Index(z12.VnB(), 3, -7); __ Index(z13.VnB(), 4, -7); __ Ptrue(p0.VnB()); __ St4b(z10.VnB(), z11.VnB(), z12.VnB(), z13.VnB(), p0, SVEMemOperand(x0)); // Save the stored values for ld4 tests. __ Dup(z3.VnB(), 0); __ Dup(z4.VnB(), 0); __ Dup(z5.VnB(), 0); __ Dup(z6.VnB(), 0); __ Mov(z3.VnB(), p0.Merging(), z10.VnB()); __ Mov(z4.VnB(), p0.Merging(), z11.VnB()); __ Mov(z5.VnB(), p0.Merging(), z12.VnB()); __ Mov(z6.VnB(), p0.Merging(), z13.VnB()); // Wrap around from z31 to z0. __ Index(z31.VnH(), -2, 5); __ Index(z0.VnH(), -3, 5); __ Index(z1.VnH(), -4, 5); __ Index(z2.VnH(), -5, 5); __ Ptrue(p1.VnH(), SVE_MUL3); __ St4h(z31.VnH(), z0.VnH(), z1.VnH(), z2.VnH(), p1, SVEMemOperand(x0, 4, SVE_MUL_VL)); // Save the stored values for ld4 tests. __ Dup(z7.VnH(), 0); __ Dup(z8.VnH(), 0); __ Dup(z9.VnH(), 0); __ Dup(z10.VnH(), 0); __ Mov(z7.VnH(), p1.Merging(), z31.VnH()); __ Mov(z8.VnH(), p1.Merging(), z0.VnH()); __ Mov(z9.VnH(), p1.Merging(), z1.VnH()); __ Mov(z10.VnH(), p1.Merging(), z2.VnH()); // Wrap around from z31 to z0. __ Index(z29.VnS(), 2, -7); __ Index(z30.VnS(), 3, -7); __ Index(z31.VnS(), 4, -7); __ Index(z0.VnS(), 5, -7); __ Ptrue(p2.VnS(), SVE_POW2); __ St4w(z29.VnS(), z30.VnS(), z31.VnS(), z0.VnS(), p2, SVEMemOperand(x0, -12, SVE_MUL_VL)); // Save the stored values for ld4 tests. __ Dup(z11.VnS(), 0); __ Dup(z12.VnS(), 0); __ Dup(z13.VnS(), 0); __ Dup(z14.VnS(), 0); __ Mov(z11.VnS(), p2.Merging(), z29.VnS()); __ Mov(z12.VnS(), p2.Merging(), z30.VnS()); __ Mov(z13.VnS(), p2.Merging(), z31.VnS()); __ Mov(z14.VnS(), p2.Merging(), z0.VnS()); __ Index(z20.VnD(), -7, 8); __ Index(z21.VnD(), -8, 8); __ Index(z22.VnD(), -9, 8); __ Index(z23.VnD(), -10, 8); // Sparse predication, including some irrelevant bits (0xee). To make the // results easy to check, activate each lane where n is a multiple of 5. Initialise(&masm, p3, 0xeee10000000001ee, 0xeeeeeee100000000, 0x01eeeeeeeee10000, 0x000001eeeeeeeee1); __ St4d(z20.VnD(), z21.VnD(), z22.VnD(), z23.VnD(), p3, SVEMemOperand(x0, 16, SVE_MUL_VL)); // Save the stored values for ld4 tests. __ Dup(z15.VnD(), 0); __ Dup(z16.VnD(), 0); __ Dup(z17.VnD(), 0); __ Dup(z18.VnD(), 0); __ Mov(z15.VnD(), p3.Merging(), z20.VnD()); __ Mov(z16.VnD(), p3.Merging(), z21.VnD()); __ Mov(z17.VnD(), p3.Merging(), z22.VnD()); __ Mov(z18.VnD(), p3.Merging(), z23.VnD()); // Corresponding loads. // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap. __ Ld4b(z31.VnB(), z0.VnB(), z1.VnB(), z2.VnB(), p0.Zeroing(), SVEMemOperand(x0)); __ Mov(z19, z31); __ Mov(z20, z0); __ Mov(z21, z1); __ Mov(z22, z2); __ Ld4h(z23.VnH(), z24.VnH(), z25.VnH(), z26.VnH(), p1.Zeroing(), SVEMemOperand(x0, 4, SVE_MUL_VL)); __ Ld4w(z27.VnS(), z28.VnS(), z29.VnS(), z30.VnS(), p2.Zeroing(), SVEMemOperand(x0, -12, SVE_MUL_VL)); // Wrap around from z31 to z0. 
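  // Worked example for the scalar-plus-immediate form below: the effective
  // address is x0 + (16 * VL), and the four-register transfer covers
  // [x0 + 16 * VL, x0 + 20 * VL), matching the `16 * vl` offset used when
  // building `expected` later. Immediates for the four-register forms are
  // multiples of four.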
  __ Ld4d(z31.VnD(), z0.VnD(), z1.VnD(), z2.VnD(), p3.Zeroing(), SVEMemOperand(x0, 16, SVE_MUL_VL));

  END();

  if (CAN_RUN()) {
    RUN();

    uint8_t* expected = new uint8_t[data_size];
    memset(expected, 0, data_size);
    uint8_t* middle = &expected[data_size / 2];

    int vl_b = vl / kBRegSizeInBytes;
    int vl_h = vl / kHRegSizeInBytes;
    int vl_s = vl / kSRegSizeInBytes;
    int vl_d = vl / kDRegSizeInBytes;

    int reg_count = 4;

    // st4b { z10.b, z11.b, z12.b, z13.b }, SVE_ALL
    for (int i = 0; i < vl_b; i++) {
      uint8_t lane0 = 1 - (7 * i);
      uint8_t lane1 = 2 - (7 * i);
      uint8_t lane2 = 3 - (7 * i);
      uint8_t lane3 = 4 - (7 * i);
      MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
      MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
      MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
      MemoryWrite(middle, 0, (i * reg_count) + 3, lane3);
    }

    // st4h { z31.h, z0.h, z1.h, z2.h }, SVE_MUL3
    int vl_h_mul3 = vl_h - (vl_h % 3);
    for (int i = 0; i < vl_h_mul3; i++) {
      int64_t offset = 4 * vl;
      uint16_t lane0 = -2 + (5 * i);
      uint16_t lane1 = -3 + (5 * i);
      uint16_t lane2 = -4 + (5 * i);
      uint16_t lane3 = -5 + (5 * i);
      MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
      MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
      MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
      MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
    }

    // st4w { z29.s, z30.s, z31.s, z0.s }, SVE_POW2
    int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
    for (int i = 0; i < vl_s_pow2; i++) {
      int64_t offset = -12 * vl;
      uint32_t lane0 = 2 - (7 * i);
      uint32_t lane1 = 3 - (7 * i);
      uint32_t lane2 = 4 - (7 * i);
      uint32_t lane3 = 5 - (7 * i);
      MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
      MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
      MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
      MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
    }

    // st4d { z20.d, z21.d, z22.d, z23.d }, ((i % 5) == 0)
    for (int i = 0; i < vl_d; i++) {
      if ((i % 5) == 0) {
        int64_t offset = 16 * vl;
        uint64_t lane0 = -7 + (8 * i);
        uint64_t lane1 = -8 + (8 * i);
        uint64_t lane2 = -9 + (8 * i);
        uint64_t lane3 = -10 + (8 * i);
        MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
        MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
        MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
        MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
      }
    }

    ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);

    // Check that we loaded back the expected values.

    // st4b/ld4b
    ASSERT_EQUAL_SVE(z3, z19);
    ASSERT_EQUAL_SVE(z4, z20);
    ASSERT_EQUAL_SVE(z5, z21);
    ASSERT_EQUAL_SVE(z6, z22);

    // st4h/ld4h
    ASSERT_EQUAL_SVE(z7, z23);
    ASSERT_EQUAL_SVE(z8, z24);
    ASSERT_EQUAL_SVE(z9, z25);
    ASSERT_EQUAL_SVE(z10, z26);

    // st4w/ld4w
    ASSERT_EQUAL_SVE(z11, z27);
    ASSERT_EQUAL_SVE(z12, z28);
    ASSERT_EQUAL_SVE(z13, z29);
    ASSERT_EQUAL_SVE(z14, z30);

    // st4d/ld4d
    ASSERT_EQUAL_SVE(z15, z31);
    ASSERT_EQUAL_SVE(z16, z0);
    ASSERT_EQUAL_SVE(z17, z1);
    ASSERT_EQUAL_SVE(z18, z2);

    delete[] expected;
  }
  delete[] data;
}

TEST_SVE(sve_ld4_st4_scalar_plus_scalar) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int vl = config->sve_vl_in_bytes();

  // Allocate plenty of space to enable indexing in both directions.
  int data_size = vl * 128;

  uint8_t* data = new uint8_t[data_size];
  memset(data, 0, data_size);

  // Set the base half-way through the buffer so we can use negative indices.
  __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));

  // We can test ld4 by comparing the values loaded with the values stored.
// There are two complications: // - Loads have zeroing predication, so we have to clear the inactive // elements on our reference. // - We want to test both loads and stores that span { z31, z0 }, so we have // to move some values around. // // Registers z3-z18 will hold as-stored values (with inactive elements // cleared). Registers z19-z31 and z0-z2 will hold the values that were // loaded. __ Index(z19.VnB(), -4, 11); __ Index(z20.VnB(), -5, 11); __ Index(z21.VnB(), -6, 11); __ Index(z22.VnB(), -7, 11); __ Ptrue(p7.VnB(), SVE_MUL4); __ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap. __ St4b(z19.VnB(), z20.VnB(), z21.VnB(), z22.VnB(), p7, SVEMemOperand(x0, x1, LSL, 0)); // Save the stored values for ld4 tests. __ Dup(z3.VnB(), 0); __ Dup(z4.VnB(), 0); __ Dup(z5.VnB(), 0); __ Dup(z6.VnB(), 0); __ Mov(z3.VnB(), p7.Merging(), z19.VnB()); __ Mov(z4.VnB(), p7.Merging(), z20.VnB()); __ Mov(z5.VnB(), p7.Merging(), z21.VnB()); __ Mov(z6.VnB(), p7.Merging(), z22.VnB()); __ Index(z23.VnH(), 6, -2); __ Index(z24.VnH(), 7, -2); __ Index(z25.VnH(), 8, -2); __ Index(z26.VnH(), 9, -2); __ Ptrue(p6.VnH(), SVE_VL16); __ Rdvl(x2, 7); // (7 * vl) << 1 = 14 * vl __ St4h(z23.VnH(), z24.VnH(), z25.VnH(), z26.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1)); // Save the stored values for ld4 tests. __ Dup(z7.VnH(), 0); __ Dup(z8.VnH(), 0); __ Dup(z9.VnH(), 0); __ Dup(z10.VnH(), 0); __ Mov(z7.VnH(), p6.Merging(), z23.VnH()); __ Mov(z8.VnH(), p6.Merging(), z24.VnH()); __ Mov(z9.VnH(), p6.Merging(), z25.VnH()); __ Mov(z10.VnH(), p6.Merging(), z26.VnH()); // Wrap around from z31 to z0. __ Index(z29.VnS(), -6, 7); __ Index(z30.VnS(), -7, 7); __ Index(z31.VnS(), -8, 7); __ Index(z0.VnS(), -9, 7); // Sparse predication, including some irrelevant bits (0xe). To make the // results easy to check, activate each lane where n is a multiple of 5. Initialise(&masm, p5, 0xeee1000010000100, 0x001eeee100001000, 0x0100001eeee10000, 0x10000100001eeee1); __ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl __ St4w(z29.VnS(), z30.VnS(), z31.VnS(), z0.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2)); // Save the stored values for ld4 tests. __ Dup(z11.VnS(), 0); __ Dup(z12.VnS(), 0); __ Dup(z13.VnS(), 0); __ Dup(z14.VnS(), 0); __ Mov(z11.VnS(), p5.Merging(), z29.VnS()); __ Mov(z12.VnS(), p5.Merging(), z30.VnS()); __ Mov(z13.VnS(), p5.Merging(), z31.VnS()); __ Mov(z14.VnS(), p5.Merging(), z0.VnS()); __ Index(z31.VnD(), 32, -11); __ Index(z0.VnD(), 33, -11); __ Index(z1.VnD(), 34, -11); __ Index(z2.VnD(), 35, -11); __ Ptrue(p4.VnD(), SVE_MUL3); __ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 *vl __ St4d(z31.VnD(), z0.VnD(), z1.VnD(), z2.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3)); // Save the stored values for ld4 tests. __ Dup(z15.VnD(), 0); __ Dup(z16.VnD(), 0); __ Dup(z17.VnD(), 0); __ Dup(z18.VnD(), 0); __ Mov(z15.VnD(), p4.Merging(), z31.VnD()); __ Mov(z16.VnD(), p4.Merging(), z0.VnD()); __ Mov(z17.VnD(), p4.Merging(), z1.VnD()); __ Mov(z18.VnD(), p4.Merging(), z2.VnD()); // Corresponding loads. // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap. __ Ld4b(z31.VnB(), z0.VnB(), z1.VnB(), z2.VnB(), p7.Zeroing(), SVEMemOperand(x0, x1, LSL, 0)); __ Mov(z19, z31); __ Mov(z20, z0); __ Mov(z21, z1); __ Mov(z22, z2); __ Ld4h(z23.VnH(), z24.VnH(), z25.VnH(), z26.VnH(), p6.Zeroing(), SVEMemOperand(x0, x2, LSL, 1)); __ Ld4w(z27.VnS(), z28.VnS(), z29.VnS(), z30.VnS(), p5.Zeroing(), SVEMemOperand(x0, x3, LSL, 2)); // Wrap around from z31 to z0. 
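  // For the Ld4d below, x4 holds -1 * VL and the LSL #3 scales it by the
  // d-lane size, giving an effective address of x0 - 8 * VL, the region the
  // St4d above wrote. Reusing { z31, z0, z1, z2 } as the destination tuple is
  // safe because the reference values were already saved in z15-z18.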
__ Ld4d(z31.VnD(), z0.VnD(), z1.VnD(), z2.VnD(), p4.Zeroing(), SVEMemOperand(x0, x4, LSL, 3)); END(); if (CAN_RUN()) { RUN(); uint8_t* expected = new uint8_t[data_size]; memset(expected, 0, data_size); uint8_t* middle = &expected[data_size / 2]; int vl_b = vl / kBRegSizeInBytes; int vl_h = vl / kHRegSizeInBytes; int vl_s = vl / kSRegSizeInBytes; int vl_d = vl / kDRegSizeInBytes; int reg_count = 4; // st4b { z19.b, z20.b, z21.b, z22.b }, SVE_MUL4 int vl_b_mul4 = vl_b - (vl_b % 4); for (int i = 0; i < vl_b_mul4; i++) { int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl; uint8_t lane0 = -4 + (11 * i); uint8_t lane1 = -5 + (11 * i); uint8_t lane2 = -6 + (11 * i); uint8_t lane3 = -7 + (11 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); MemoryWrite(middle, offset, (i * reg_count) + 3, lane3); } // st4h { z22.h, z23.h, z24.h, z25.h }, SVE_VL16 if (vl_h >= 16) { for (int i = 0; i < 16; i++) { int64_t offset = (7 << kHRegSizeInBytesLog2) * vl; uint16_t lane0 = 6 - (2 * i); uint16_t lane1 = 7 - (2 * i); uint16_t lane2 = 8 - (2 * i); uint16_t lane3 = 9 - (2 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); MemoryWrite(middle, offset, (i * reg_count) + 3, lane3); } } // st4w { z29.s, z30.s, z31.s, z0.s }, ((i % 5) == 0) for (int i = 0; i < vl_s; i++) { if ((i % 5) == 0) { int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl; uint32_t lane0 = -6 + (7 * i); uint32_t lane1 = -7 + (7 * i); uint32_t lane2 = -8 + (7 * i); uint32_t lane3 = -9 + (7 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); MemoryWrite(middle, offset, (i * reg_count) + 3, lane3); } } // st4d { z31.d, z0.d, z1.d, z2.d }, SVE_MUL3 int vl_d_mul3 = vl_d - (vl_d % 3); for (int i = 0; i < vl_d_mul3; i++) { int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl; uint64_t lane0 = 32 - (11 * i); uint64_t lane1 = 33 - (11 * i); uint64_t lane2 = 34 - (11 * i); uint64_t lane3 = 35 - (11 * i); MemoryWrite(middle, offset, (i * reg_count) + 0, lane0); MemoryWrite(middle, offset, (i * reg_count) + 1, lane1); MemoryWrite(middle, offset, (i * reg_count) + 2, lane2); MemoryWrite(middle, offset, (i * reg_count) + 3, lane3); } ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected); // Check that we loaded back the expected values. // st4b/ld4b ASSERT_EQUAL_SVE(z3, z19); ASSERT_EQUAL_SVE(z4, z20); ASSERT_EQUAL_SVE(z5, z21); ASSERT_EQUAL_SVE(z6, z22); // st4h/ld4h ASSERT_EQUAL_SVE(z7, z23); ASSERT_EQUAL_SVE(z8, z24); ASSERT_EQUAL_SVE(z9, z25); ASSERT_EQUAL_SVE(z10, z26); // st4w/ld4w ASSERT_EQUAL_SVE(z11, z27); ASSERT_EQUAL_SVE(z12, z28); ASSERT_EQUAL_SVE(z13, z29); ASSERT_EQUAL_SVE(z14, z30); // st4d/ld4d ASSERT_EQUAL_SVE(z15, z31); ASSERT_EQUAL_SVE(z16, z0); ASSERT_EQUAL_SVE(z17, z1); ASSERT_EQUAL_SVE(z18, z2); delete[] expected; } delete[] data; } TEST_SVE(sve_ld234_st234_scalar_plus_scalar_sp) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Check that the simulator correctly interprets rn == 31 as sp. // The indexing logic is the same regardless so we just check one load and // store of each type. // There are no pre- or post-indexing modes, so reserve space first. 
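  // A sketch of the stack layout set up below (assuming ClaimVL(n) reserves
  // n * VL bytes): the st2b region is [sp, sp + 2 * VL), the st3h region
  // starts at sp + 2 * VL (x1 = (2 * VL) >> 1 halfword units, scaled back by
  // LSL #1), and the st4w region starts at sp + 5 * VL (x2 = (5 * VL) >> 2
  // word units, scaled back by LSL #2), so the three regions do not overlap.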
__ ClaimVL(2 + 3 + 4); __ Index(z0.VnB(), 42, 2); __ Index(z1.VnB(), 43, 2); __ Ptrue(p0.VnB(), SVE_VL7); __ Rdvl(x0, 0); __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, x0)); __ Index(z4.VnH(), 42, 3); __ Index(z5.VnH(), 43, 3); __ Index(z6.VnH(), 44, 3); __ Ptrue(p1.VnH(), SVE_POW2); __ Rdvl(x1, 2); __ Lsr(x1, x1, 1); __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, x1, LSL, 1)); __ Index(z8.VnS(), 42, 4); __ Index(z9.VnS(), 43, 4); __ Index(z10.VnS(), 44, 4); __ Index(z11.VnS(), 45, 4); __ Ptrue(p2.VnS()); __ Rdvl(x2, 2 + 3); __ Lsr(x2, x2, 2); __ St4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2, SVEMemOperand(sp, x2, LSL, 2)); // Corresponding loads. // We have to explicitly zero inactive lanes in the reference values because // loads have zeroing predication. __ Dup(z12.VnB(), 0); __ Dup(z13.VnB(), 0); __ Mov(z12.VnB(), p0.Merging(), z0.VnB()); __ Mov(z13.VnB(), p0.Merging(), z1.VnB()); __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, x0)); __ Dup(z16.VnH(), 0); __ Dup(z17.VnH(), 0); __ Dup(z18.VnH(), 0); __ Mov(z16.VnH(), p1.Merging(), z4.VnH()); __ Mov(z17.VnH(), p1.Merging(), z5.VnH()); __ Mov(z18.VnH(), p1.Merging(), z6.VnH()); __ Ld3h(z4.VnH(), z5.VnH(), z6.VnH(), p1.Zeroing(), SVEMemOperand(sp, x1, LSL, 1)); __ Dup(z20.VnS(), 0); __ Dup(z21.VnS(), 0); __ Dup(z22.VnS(), 0); __ Dup(z23.VnS(), 0); __ Mov(z20.VnS(), p2.Merging(), z8.VnS()); __ Mov(z21.VnS(), p2.Merging(), z9.VnS()); __ Mov(z22.VnS(), p2.Merging(), z10.VnS()); __ Mov(z23.VnS(), p2.Merging(), z11.VnS()); __ Ld4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2.Zeroing(), SVEMemOperand(sp, x2, LSL, 2)); __ DropVL(2 + 3 + 4); END(); if (CAN_RUN()) { RUN(); // The most likely failure mode is the that simulator reads sp as xzr and // crashes on execution. We already test the address calculations separately // and sp doesn't change this, so just test that we load the values we // stored. // st2b/ld2b ASSERT_EQUAL_SVE(z0, z12); ASSERT_EQUAL_SVE(z1, z13); // st3h/ld3h ASSERT_EQUAL_SVE(z4, z16); ASSERT_EQUAL_SVE(z5, z17); ASSERT_EQUAL_SVE(z6, z18); // st4h/ld4h ASSERT_EQUAL_SVE(z8, z20); ASSERT_EQUAL_SVE(z9, z21); ASSERT_EQUAL_SVE(z10, z22); ASSERT_EQUAL_SVE(z11, z23); } } TEST_SVE(sve_ld234_st234_scalar_plus_imm_sp) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Check that the simulator correctly interprets rn == 31 as sp. // The indexing logic is the same regardless so we just check one load and // store of each type. // There are no pre- or post-indexing modes, so reserve space first. // Note that the stores fill in an order that allows each immediate to be a // multiple of the number of registers. __ ClaimVL(4 + 2 + 3); __ Index(z0.VnB(), 42, 2); __ Index(z1.VnB(), 43, 2); __ Ptrue(p0.VnB(), SVE_POW2); __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, 4, SVE_MUL_VL)); __ Index(z4.VnH(), 42, 3); __ Index(z5.VnH(), 43, 3); __ Index(z6.VnH(), 44, 3); __ Ptrue(p1.VnH(), SVE_VL7); __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, 6, SVE_MUL_VL)); __ Index(z8.VnS(), 42, 4); __ Index(z9.VnS(), 43, 4); __ Index(z10.VnS(), 44, 4); __ Index(z11.VnS(), 45, 4); __ Ptrue(p2.VnS()); __ St4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2, SVEMemOperand(sp)); // Corresponding loads. // We have to explicitly zero inactive lanes in the reference values because // loads have zeroing predication. 
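  // The references use the usual zero-then-merge idiom, sketched here with
  // placeholder register names (illustrative only, not extra test code):
  //
  //   __ Dup(zref.VnB(), 0);                         // All lanes zero.
  //   __ Mov(zref.VnB(), pg.Merging(), zsrc.VnB());  // Copy active lanes.
  //
  // This leaves inactive lanes at zero, which is exactly what a
  // zeroing-predicated load of the same data produces.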
__ Dup(z12.VnB(), 0); __ Dup(z13.VnB(), 0); __ Mov(z12.VnB(), p0.Merging(), z0.VnB()); __ Mov(z13.VnB(), p0.Merging(), z1.VnB()); __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, 4, SVE_MUL_VL)); __ Dup(z16.VnH(), 0); __ Dup(z17.VnH(), 0); __ Dup(z18.VnH(), 0); __ Mov(z16.VnH(), p1.Merging(), z4.VnH()); __ Mov(z17.VnH(), p1.Merging(), z5.VnH()); __ Mov(z18.VnH(), p1.Merging(), z6.VnH()); __ Ld3h(z4.VnH(), z5.VnH(), z6.VnH(), p1.Zeroing(), SVEMemOperand(sp, 6, SVE_MUL_VL)); __ Dup(z20.VnS(), 0); __ Dup(z21.VnS(), 0); __ Dup(z22.VnS(), 0); __ Dup(z23.VnS(), 0); __ Mov(z20.VnS(), p2.Merging(), z8.VnS()); __ Mov(z21.VnS(), p2.Merging(), z9.VnS()); __ Mov(z22.VnS(), p2.Merging(), z10.VnS()); __ Mov(z23.VnS(), p2.Merging(), z11.VnS()); __ Ld4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2.Zeroing(), SVEMemOperand(sp)); __ DropVL(4 + 2 + 3); END(); if (CAN_RUN()) { RUN(); // The most likely failure mode is the that simulator reads sp as xzr and // crashes on execution. We already test the address calculations separately // and sp doesn't change this, so just test that we load the values we // stored. // TODO: Actually do this, once loads are implemented. } } // Fill the input buffer with arbitrary data. Meanwhile, assign random offsets // from the base address of the buffer and corresponding addresses to the // arguments if provided. static void BufferFillingHelper(uint64_t data_ptr, size_t buffer_size, unsigned lane_size_in_bytes, int lane_count, uint64_t* offsets, uint64_t* addresses = nullptr, uint64_t* max_address = nullptr) { // Use a fixed seed for nrand48() so that test runs are reproducible. unsigned short seed[3] = {1, 2, 3}; // NOLINT(google-runtime-int) // Fill a buffer with arbitrary data. for (size_t i = 0; i < buffer_size; i++) { uint8_t byte = nrand48(seed) & 0xff; memcpy(reinterpret_cast(data_ptr + i), &byte, 1); } if (max_address != nullptr) { *max_address = 0; } // Vectors of random addresses and offsets into the buffer. for (int i = 0; i < lane_count; i++) { uint64_t rnd = nrand48(seed); // Limit the range to the set of completely-accessible elements in memory. offsets[i] = rnd % (buffer_size - lane_size_in_bytes); if ((addresses != nullptr) && (max_address != nullptr)) { addresses[i] = data_ptr + offsets[i]; *max_address = std::max(*max_address, addresses[i]); } } } static void ScalarLoadHelper(MacroAssembler* masm, Register dst, Register addr, int msize_in_bits, bool is_signed) { if (is_signed) { switch (msize_in_bits) { case kBRegSize: masm->Ldrsb(dst, MemOperand(addr)); break; case kHRegSize: masm->Ldrsh(dst, MemOperand(addr)); break; case kWRegSize: masm->Ldrsw(dst, MemOperand(addr)); break; default: VIXL_UNIMPLEMENTED(); break; } } else { switch (msize_in_bits) { case kBRegSize: masm->Ldrb(dst, MemOperand(addr)); break; case kHRegSize: masm->Ldrh(dst, MemOperand(addr)); break; case kWRegSize: masm->Ldr(dst.W(), MemOperand(addr)); break; case kXRegSize: masm->Ldr(dst, MemOperand(addr)); break; default: VIXL_UNIMPLEMENTED(); break; } } } // Generate a reference result using scalar loads. // For now this helper doesn't save and restore the caller registers. // Clobber register z30, x28, x29 and p7. 
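// The reference is built one lane at a time: lane numbers are materialised
// with `Index`, each address is loaded with a scalar load, and a merging
// `Cpy` under a single-lane predicate (formed with `Cmpeq` against the lane
// number, within `pg`) deposits the value into just that lane. Note that
// `addresses[N - 1]` ends up in lane 0.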
template static void ScalarLoadHelper(MacroAssembler* masm, int vl, const uint64_t (&addresses)[N], const ZRegister& zt_ref, const PRegisterZ& pg, unsigned esize_in_bits, unsigned msize_in_bits, bool is_signed) { unsigned esize_in_bytes = esize_in_bits / kBitsPerByte; ZRegister lane_numbers = z30.WithLaneSize(esize_in_bits); masm->Index(lane_numbers, 0, 1); masm->Dup(zt_ref, 0); for (unsigned i = 0; i < (vl / esize_in_bytes); i++) { masm->Mov(x29, addresses[N - i - 1]); Register rt(28, std::min(std::max(esize_in_bits, kSRegSize), kDRegSize)); ScalarLoadHelper(masm, rt, x29, msize_in_bits, is_signed); // Emulate predication. masm->Cmpeq(p7.WithLaneSize(esize_in_bits), pg, lane_numbers, i); masm->Cpy(zt_ref, p7.Merging(), rt); } } typedef void (MacroAssembler::*Ld1Macro)(const ZRegister& zt, const PRegisterZ& pg, const SVEMemOperand& addr); template static void Ldff1Helper(Test* config, uintptr_t data, unsigned msize_in_bits, unsigned esize_in_bits, CPURegister::RegisterType base_type, Ld1Macro ldff1, Ld1Macro ld1, T mod, bool scale = false) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int vl = config->sve_vl_in_bytes(); size_t page_size = sysconf(_SC_PAGE_SIZE); VIXL_ASSERT(page_size > static_cast(vl)); unsigned esize_in_bytes = esize_in_bits / kBitsPerByte; unsigned msize_in_bytes = msize_in_bits / kBitsPerByte; unsigned msize_in_bytes_log2 = std::log2(msize_in_bytes); VIXL_ASSERT(msize_in_bits <= esize_in_bits); PRegister all = p7; __ Ptrue(all.VnB()); size_t offset_modifier = 0; // The highest address at which a load stopped. Every FF load should fault at // `data + page_size`, so this value should not exceed that value. However, // the architecture allows fault-tolerant loads to fault arbitrarily, so the // real value may be lower. // // This is used to check that the `mprotect` above really does make the second // page inaccessible, and that the resulting FFR from each load reflects that. Register limit = x22; __ Mov(limit, 0); // If the FFR grows unexpectedly, we increment this register by the // difference. FFR should never grow, except when explicitly set. Register ffr_grow_count = x23; __ Mov(ffr_grow_count, 0); // Set the offset so that the load is guaranteed to start in the // accessible page, but end in the inaccessible one. VIXL_ASSERT((page_size % msize_in_bytes) == 0); VIXL_ASSERT((vl % msize_in_bytes) == 0); size_t elements_per_page = page_size / msize_in_bytes; size_t elements_per_access = vl / esize_in_bytes; size_t min_offset = (elements_per_page - elements_per_access) + 1; size_t max_offset = elements_per_page - 1; size_t offset = min_offset + (offset_modifier % (max_offset - min_offset + 1)); offset_modifier++; __ Setffr(); __ Mov(x20, data); __ Mov(x21, offset); if (base_type == CPURegister::kRegister) { // Scalar-plus-scalar mode. VIXL_ASSERT((std::is_same::value)); VIXL_ASSERT((static_cast(mod) == LSL) || (static_cast(mod) == NO_SHIFT)); (masm.*ldff1)(z0.WithLaneSize(esize_in_bits), all.Zeroing(), SVEMemOperand(x20, x21, mod, msize_in_bytes_log2)); } else { VIXL_ASSERT(base_type == CPURegister::kZRegister); int offs_size; bool offs_is_unsigned; if (std::is_same::value) { // Scalar-plus-vector mode with 32-bit optional unpacked or upacked, and // unscaled or scaled offset. VIXL_ASSERT((static_cast(mod) == SXTW) || (static_cast(mod) == UXTW)); if (scale == true) { // Gather first-fault bytes load doesn't support scaled offset. VIXL_ASSERT(msize_in_bits != kBRegSize); } offs_is_unsigned = (static_cast(mod) == UXTW) ? 
true : false; offs_size = kSRegSize; } else { // Scalar-plus-vector mode with 64-bit unscaled or scaled offset. VIXL_ASSERT((std::is_same::value)); VIXL_ASSERT((static_cast(mod) == LSL) || (static_cast(mod) == NO_SHIFT)); offs_is_unsigned = false; offs_size = kDRegSize; } // For generating the pattern of "base address + index << shift". // In case of unscaled-offset operation, use `msize_in_bytes` be an offset // of each decreasing memory accesses. otherwise, decreases the indexes by 1 // and then scale it by the shift value. int shift = (scale == true) ? msize_in_bytes_log2 : 0; int index_offset = msize_in_bytes >> shift; VIXL_ASSERT(index_offset > 0); uint64_t index = 0; uint64_t base_address = 0; if (offs_is_unsigned == true) { // Base address. base_address = data; // Maximum unsigned positive index. index = page_size >> shift; } else { // Base address. base_address = data + (2 * page_size); // Maximum unsigned positive index. uint64_t uint_e_max = (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX; index = uint_e_max - (page_size >> shift) + 1; } __ Mov(x19, base_address); if ((offs_size == kSRegSize) && (esize_in_bits == kDRegSize)) { // In this case, the index values are optionally sign or zero-extended // from 32 to 64 bits, assign a convenient value to the top 32 bits to // ensure only the low 32 bits be the index values. index |= 0x1234567800000000; } index -= index_offset * (elements_per_access - 1); __ Index(z17.WithLaneSize(esize_in_bits), index, index_offset); // Scalar plus vector mode. (masm.* ldff1)(z0.WithLaneSize(esize_in_bits), all.Zeroing(), SVEMemOperand(x19, z17.WithLaneSize(esize_in_bits), mod, shift)); } __ Rdffrs(p0.VnB(), all.Zeroing()); // Execute another Ldff1 with no offset, so that every element could be // read. It should respect FFR, and load no more than we loaded the // first time. (masm.* ldff1)(z16.WithLaneSize(esize_in_bits), all.Zeroing(), SVEMemOperand(x20)); __ Rdffrs(p1.VnB(), all.Zeroing()); __ Cntp(x0, all, p1.VnB()); __ Uqdecp(x0, p0.VnB()); __ Add(ffr_grow_count, ffr_grow_count, x0); // Use the FFR to predicate the normal load. If it wasn't properly set, // the normal load will abort. (masm.*ld1)(z16.WithLaneSize(esize_in_bits), p0.Zeroing(), SVEMemOperand(x20, x21, LSL, msize_in_bytes_log2)); // Work out the address after the one that was just accessed. __ Incp(x21, p0.WithLaneSize(esize_in_bits)); __ Add(x0, x20, Operand(x21, LSL, msize_in_bytes_log2)); __ Cmp(limit, x0); __ Csel(limit, limit, x0, hs); // Clear lanes inactive in FFR. These have an undefined result. __ Not(p0.VnB(), all.Zeroing(), p0.VnB()); __ Mov(z0.WithLaneSize(esize_in_bits), p0.Merging(), 0); END(); if (CAN_RUN()) { RUN(); uintptr_t expected_limit = data + page_size; uintptr_t measured_limit = core.xreg(limit.GetCode()); VIXL_CHECK(measured_limit <= expected_limit); if (measured_limit < expected_limit) { // We can't fail the test for this case, but a warning is helpful for // manually-run tests. printf( "WARNING: All fault-tolerant loads detected faults before the\n" "expected limit. This is architecturally possible, but improbable,\n" "and could be a symptom of another problem.\n"); } ASSERT_EQUAL_64(0, ffr_grow_count); ASSERT_EQUAL_SVE(z0.WithLaneSize(esize_in_bits), z16.WithLaneSize(esize_in_bits)); } } TEST_SVE(sve_ldff1_scalar_plus_scalar) { size_t page_size = sysconf(_SC_PAGE_SIZE); VIXL_ASSERT(page_size > static_cast(config->sve_vl_in_bytes())); // Allocate two pages, then mprotect the second one to make it inaccessible. 
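  // The two-page layout used by these tests looks like this:
  //
  //   [data, data + page_size)                  : readable and writable
  //   [data + page_size, data + 2 * page_size)  : PROT_NONE (accesses fault)
  //
  // so a first-fault load whose first element lies in the first page, but
  // whose later elements run into the second page, should stop at the page
  // boundary and clear the corresponding FFR bits.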
uintptr_t data = reinterpret_cast(mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); mprotect(reinterpret_cast(data + page_size), page_size, PROT_NONE); // Fill the accessible page with arbitrary data. for (size_t i = 0; i < page_size; i++) { // Reverse bits so we get a mixture of positive and negative values. uint8_t byte = ReverseBits(static_cast(i)); memcpy(reinterpret_cast(data + i), &byte, 1); } auto ldff1_unscaled_offset_helper = std::bind(&Ldff1Helper, config, data, std::placeholders::_1, std::placeholders::_2, CPURegister::kRegister, std::placeholders::_3, std::placeholders::_4, NO_SHIFT, false); Ld1Macro ldff1b = &MacroAssembler::Ldff1b; Ld1Macro ld1b = &MacroAssembler::Ld1b; ldff1_unscaled_offset_helper(kBRegSize, kBRegSize, ldff1b, ld1b); ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1b, ld1b); ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1b, ld1b); ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1b, ld1b); Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb; Ld1Macro ld1sb = &MacroAssembler::Ld1sb; ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1sb, ld1sb); ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1sb, ld1sb); ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1sb, ld1sb); auto ldff1_scaled_offset_helper = std::bind(&Ldff1Helper, config, data, std::placeholders::_1, std::placeholders::_2, CPURegister::kRegister, std::placeholders::_3, std::placeholders::_4, LSL, true); Ld1Macro ldff1h = &MacroAssembler::Ldff1h; Ld1Macro ld1h = &MacroAssembler::Ld1h; ldff1_scaled_offset_helper(kHRegSize, kHRegSize, ldff1h, ld1h); ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1h, ld1h); ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1h, ld1h); Ld1Macro ldff1w = &MacroAssembler::Ldff1w; Ld1Macro ld1w = &MacroAssembler::Ld1w; ldff1_scaled_offset_helper(kSRegSize, kSRegSize, ldff1w, ld1w); ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1w, ld1w); Ld1Macro ldff1d = &MacroAssembler::Ldff1d; Ld1Macro ld1d = &MacroAssembler::Ld1d; ldff1_scaled_offset_helper(kDRegSize, kDRegSize, ldff1d, ld1d); Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; Ld1Macro ld1sh = &MacroAssembler::Ld1sh; ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1sh, ld1sh); ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1sh, ld1sh); Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw; Ld1Macro ld1sw = &MacroAssembler::Ld1sw; ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1sw, ld1sw); munmap(reinterpret_cast(data), page_size * 2); } static void sve_ldff1_scalar_plus_vector_32_scaled_offset(Test* config, uintptr_t data) { auto ldff1_32_scaled_offset_helper = std::bind(&Ldff1Helper, config, data, std::placeholders::_1, kSRegSize, CPURegister::kZRegister, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, true); Ld1Macro ldff1h = &MacroAssembler::Ldff1h; Ld1Macro ld1h = &MacroAssembler::Ld1h; ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW); ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW); Ld1Macro ldff1w = &MacroAssembler::Ldff1w; Ld1Macro ld1w = &MacroAssembler::Ld1w; ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW); ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW); Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; Ld1Macro ld1sh = &MacroAssembler::Ld1sh; ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW); ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW); } static void sve_ldff1_scalar_plus_vector_32_unscaled_offset(Test* config, uintptr_t data) { 
auto ldff1_32_unscaled_offset_helper = std::bind(&Ldff1Helper, config, data, std::placeholders::_1, kSRegSize, CPURegister::kZRegister, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, false); Ld1Macro ldff1b = &MacroAssembler::Ldff1b; Ld1Macro ld1b = &MacroAssembler::Ld1b; ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW); ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW); Ld1Macro ldff1h = &MacroAssembler::Ldff1h; Ld1Macro ld1h = &MacroAssembler::Ld1h; ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW); ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW); Ld1Macro ldff1w = &MacroAssembler::Ldff1w; Ld1Macro ld1w = &MacroAssembler::Ld1w; ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW); ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW); Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb; Ld1Macro ld1sb = &MacroAssembler::Ld1sb; ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW); ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW); Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; Ld1Macro ld1sh = &MacroAssembler::Ld1sh; ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW); ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW); } static void sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset( Test* config, uintptr_t data) { auto ldff1_32_unpacked_scaled_offset_helper = std::bind(&Ldff1Helper, config, data, std::placeholders::_1, kDRegSize, CPURegister::kZRegister, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, true); Ld1Macro ldff1h = &MacroAssembler::Ldff1h; Ld1Macro ld1h = &MacroAssembler::Ld1h; ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW); ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW); Ld1Macro ldff1w = &MacroAssembler::Ldff1w; Ld1Macro ld1w = &MacroAssembler::Ld1w; ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW); ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW); Ld1Macro ldff1d = &MacroAssembler::Ldff1d; Ld1Macro ld1d = &MacroAssembler::Ld1d; ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW); ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW); Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; Ld1Macro ld1sh = &MacroAssembler::Ld1sh; ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW); ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW); Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw; Ld1Macro ld1sw = &MacroAssembler::Ld1sw; ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW); ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW); } static void sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset( Test* config, uintptr_t data) { auto ldff1_32_unpacked_unscaled_offset_helper = std::bind(&Ldff1Helper, config, data, std::placeholders::_1, kDRegSize, CPURegister::kZRegister, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, false); Ld1Macro ldff1b = &MacroAssembler::Ldff1b; Ld1Macro ld1b = &MacroAssembler::Ld1b; ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW); ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW); Ld1Macro ldff1h = &MacroAssembler::Ldff1h; Ld1Macro ld1h = &MacroAssembler::Ld1h; ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW); ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW); Ld1Macro ldff1w = 
&MacroAssembler::Ldff1w; Ld1Macro ld1w = &MacroAssembler::Ld1w; ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW); ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW); Ld1Macro ldff1d = &MacroAssembler::Ldff1d; Ld1Macro ld1d = &MacroAssembler::Ld1d; ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW); ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW); Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb; Ld1Macro ld1sb = &MacroAssembler::Ld1sb; ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW); ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW); Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; Ld1Macro ld1sh = &MacroAssembler::Ld1sh; ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW); ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW); Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw; Ld1Macro ld1sw = &MacroAssembler::Ld1sw; ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW); ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW); } static void sve_ldff1_scalar_plus_vector_64_scaled_offset(Test* config, uintptr_t data) { auto ldff1_64_scaled_offset_helper = std::bind(&Ldff1Helper, config, data, std::placeholders::_1, kDRegSize, CPURegister::kZRegister, std::placeholders::_2, std::placeholders::_3, LSL, true); Ld1Macro ldff1h = &MacroAssembler::Ldff1h; Ld1Macro ld1h = &MacroAssembler::Ld1h; ldff1_64_scaled_offset_helper(kHRegSize, ldff1h, ld1h); Ld1Macro ldff1w = &MacroAssembler::Ldff1w; Ld1Macro ld1w = &MacroAssembler::Ld1w; ldff1_64_scaled_offset_helper(kSRegSize, ldff1w, ld1w); Ld1Macro ldff1d = &MacroAssembler::Ldff1d; Ld1Macro ld1d = &MacroAssembler::Ld1d; ldff1_64_scaled_offset_helper(kDRegSize, ldff1d, ld1d); Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; Ld1Macro ld1sh = &MacroAssembler::Ld1sh; ldff1_64_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh); Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw; Ld1Macro ld1sw = &MacroAssembler::Ld1sw; ldff1_64_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw); } static void sve_ldff1_scalar_plus_vector_64_unscaled_offset(Test* config, uintptr_t data) { auto ldff1_64_unscaled_offset_helper = std::bind(&Ldff1Helper, config, data, std::placeholders::_1, kDRegSize, CPURegister::kZRegister, std::placeholders::_2, std::placeholders::_3, NO_SHIFT, false); Ld1Macro ldff1b = &MacroAssembler::Ldff1b; Ld1Macro ld1b = &MacroAssembler::Ld1b; ldff1_64_unscaled_offset_helper(kBRegSize, ldff1b, ld1b); Ld1Macro ldff1h = &MacroAssembler::Ldff1h; Ld1Macro ld1h = &MacroAssembler::Ld1h; ldff1_64_unscaled_offset_helper(kHRegSize, ldff1h, ld1h); Ld1Macro ldff1w = &MacroAssembler::Ldff1w; Ld1Macro ld1w = &MacroAssembler::Ld1w; ldff1_64_unscaled_offset_helper(kSRegSize, ldff1w, ld1w); Ld1Macro ldff1d = &MacroAssembler::Ldff1d; Ld1Macro ld1d = &MacroAssembler::Ld1d; ldff1_64_unscaled_offset_helper(kDRegSize, ldff1d, ld1d); Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb; Ld1Macro ld1sb = &MacroAssembler::Ld1sb; ldff1_64_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb); Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; Ld1Macro ld1sh = &MacroAssembler::Ld1sh; ldff1_64_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh); Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw; Ld1Macro ld1sw = &MacroAssembler::Ld1sw; ldff1_64_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw); } TEST_SVE(sve_ldff1_scalar_plus_vector) { size_t page_size = sysconf(_SC_PAGE_SIZE); VIXL_ASSERT(page_size > 
static_cast(config->sve_vl_in_bytes())); // Allocate two pages, then mprotect the second one to make it inaccessible. uintptr_t data = reinterpret_cast(mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); mprotect(reinterpret_cast(data + page_size), page_size, PROT_NONE); // Fill the accessible page with arbitrary data. for (size_t i = 0; i < page_size; i++) { // Reverse bits so we get a mixture of positive and negative values. uint8_t byte = ReverseBits(static_cast(i)); memcpy(reinterpret_cast(data + i), &byte, 1); } sve_ldff1_scalar_plus_vector_32_scaled_offset(config, data); sve_ldff1_scalar_plus_vector_32_unscaled_offset(config, data); sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(config, data); sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(config, data); sve_ldff1_scalar_plus_vector_64_scaled_offset(config, data); sve_ldff1_scalar_plus_vector_64_unscaled_offset(config, data); munmap(reinterpret_cast(data), page_size * 2); } TEST_SVE(sve_ldnf1) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON, CPUFeatures::kFP); START(); size_t page_size = sysconf(_SC_PAGE_SIZE); VIXL_ASSERT(page_size > static_cast(config->sve_vl_in_bytes())); // Allocate two pages, fill them with data, then mprotect the second one to // make it inaccessible. uintptr_t data = reinterpret_cast(mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); // Fill the pages with arbitrary data. for (size_t i = 0; i < page_size; i++) { // Reverse bits so we get a mixture of positive and negative values. uint8_t byte = ReverseBits(static_cast(i)); memcpy(reinterpret_cast(data + i), &byte, 1); } mprotect(reinterpret_cast(data + page_size), page_size, PROT_NONE); __ Setffr(); __ Ptrue(p0.VnB()); __ Dup(z10.VnB(), 0); // Move an address that points to the last unprotected eight bytes. __ Mov(x0, data + page_size - (kQRegSizeInBytes / kBRegSizeInBytes) / 2); // Load, non-faulting, a vector of bytes from x0. At most, eight bytes will be // loaded, the rest being in a protected page. __ Ldnf1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0)); __ Rdffr(p1.VnB()); __ Setffr(); // Create references using the FFR value in p1 to zero the undefined lanes. __ Sel(z0.VnB(), p1, z0.VnB(), z10.VnB()); __ Ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0)); // Repeat for larger elements and different addresses, giving different FFR // results. __ Add(x1, x0, 1); __ Ldnf1h(z1.VnH(), p0.Zeroing(), SVEMemOperand(x1)); __ Rdffr(p1.VnB()); __ Setffr(); __ Sel(z1.VnH(), p1, z1.VnH(), z10.VnH()); __ Ld1h(z21.VnH(), p1.Zeroing(), SVEMemOperand(x1)); __ Add(x1, x0, 2); __ Ldnf1w(z2.VnS(), p0.Zeroing(), SVEMemOperand(x1)); __ Rdffr(p1.VnB()); __ Setffr(); __ Sel(z2.VnS(), p1, z2.VnS(), z10.VnS()); __ Ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x1)); __ Sub(x1, x0, 1); __ Ldnf1d(z3.VnD(), p0.Zeroing(), SVEMemOperand(x1)); __ Rdffr(p1.VnB()); __ Setffr(); __ Sel(z3.VnD(), p1, z3.VnD(), z10.VnD()); __ Ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x1)); // Load from previous VL-sized area of memory. All of this should be in the // accessible page. __ Ldnf1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL)); __ Rdffr(p1.VnB()); __ Setffr(); __ Sel(z4.VnB(), p1, z4.VnB(), z10.VnB()); __ Ld1b(z24.VnB(), p1.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL)); // Repeat partial load for larger element size. 
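  // Worked example for the partial load below:
  // (kQRegSizeInBytes / kSRegSizeInBytes) / 2 is (16 / 4) / 2 = 2, so x0 ends
  // up two bytes below the protected page. Ldnf1b into S-sized lanes reads one
  // byte per lane, so only the first two lanes are guaranteed to be readable;
  // the non-faulting load clears the FFR bits for the rest instead of
  // faulting.
  VIXL_STATIC_ASSERT(((kQRegSizeInBytes / kSRegSizeInBytes) / 2) == 2);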
__ Mov(x0, data + page_size - (kQRegSizeInBytes / kSRegSizeInBytes) / 2); __ Ldnf1b(z5.VnS(), p0.Zeroing(), SVEMemOperand(x0)); __ Rdffr(p1.VnB()); __ Setffr(); __ Sel(z5.VnS(), p1, z5.VnS(), z10.VnS()); __ Ld1b(z25.VnS(), p1.Zeroing(), SVEMemOperand(x0)); // Repeat for sign extension. __ Mov(x0, data + page_size - (kQRegSizeInBytes / kHRegSizeInBytes) / 2); __ Ldnf1sb(z6.VnH(), p0.Zeroing(), SVEMemOperand(x0)); __ Rdffr(p1.VnB()); __ Setffr(); __ Sel(z6.VnH(), p1, z6.VnH(), z10.VnH()); __ Ld1sb(z26.VnH(), p1.Zeroing(), SVEMemOperand(x0)); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z20, z0); ASSERT_EQUAL_SVE(z21, z1); ASSERT_EQUAL_SVE(z22, z2); ASSERT_EQUAL_SVE(z23, z3); ASSERT_EQUAL_SVE(z24, z4); ASSERT_EQUAL_SVE(z25, z5); ASSERT_EQUAL_SVE(z26, z6); } munmap(reinterpret_cast(data), page_size * 2); } // Emphasis on test if the modifiers are propagated and simulated correctly. TEST_SVE(sve_ldff1_regression_test) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); size_t page_size = sysconf(_SC_PAGE_SIZE); VIXL_ASSERT(page_size > static_cast(config->sve_vl_in_bytes())); uintptr_t data = reinterpret_cast(mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); uintptr_t middle = data + page_size; // Fill the accessible page with arbitrary data. for (size_t i = 0; i < page_size; i++) { // Reverse bits so we get a mixture of positive and negative values. uint8_t byte = ReverseBits(static_cast(i)); memcpy(reinterpret_cast(middle + i), &byte, 1); // Make one bit roughly different in every byte and copy the bytes in the // reverse direction that convenient to verifying the loads in negative // indexes. byte += 1; memcpy(reinterpret_cast(middle - i), &byte, 1); } PRegister all = p6; __ Ptrue(all.VnB()); __ Mov(x0, middle); __ Index(z31.VnS(), 0, 3); __ Neg(z30.VnS(), z31.VnS()); __ Setffr(); // Scalar plus vector 32 unscaled offset __ Ldff1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW)); __ Ldff1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW)); __ Ldff1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW)); __ Ldff1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW)); __ Ldff1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW)); // Scalar plus vector 32 scaled offset __ Ldff1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1)); __ Ldff1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2)); __ Ldff1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1)); __ Index(z31.VnD(), 0, 3); __ Neg(z30.VnD(), z31.VnD()); // Ensure only the low 32 bits are used for the testing with positive index // values. It also test if the indexes are treated as positive in `uxtw` form. 
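  // Worked example of the trick below: x2 = x0 - 0x80000000 and each D-sized
  // lane of z29 holds index + 0x8000000080000000. With UXTW only the low 32
  // bits of the lane are used, zero-extended, so the effective address is
  // (x0 - 0x80000000) + (index + 0x80000000) = x0 + index. If the 32-bit
  // offset were wrongly sign-extended, the set top bit would subtract 2^32
  // and the access would land at the wrong address. The 0x80000000 in the
  // upper half of each lane also checks that the high 32 bits are ignored.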
  __ Mov(x3, 0x8000000080000000);
  __ Dup(z28.VnD(), x3);
  __ Sub(x2, x0, 0x80000000);
  __ Add(z29.VnD(), z31.VnD(), z28.VnD());

  // Scalar plus vector 32 unpacked unscaled offset
  __ Ldff1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
  __ Ldff1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
  __ Ldff1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
  __ Ldff1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
  __ Ldff1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
  __ Ldff1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));

  // Scalar plus vector 32 unpacked scaled offset
  __ Ldff1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
  __ Ldff1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
  __ Ldff1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
  __ Ldff1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
  __ Ldff1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));

  __ Sub(x0, x0, x3);
  // Note that `0x8000000080000000` has been added to the positive indexes. The
  // wrong address will be accessed if they are treated as negative.

  // Scalar plus vector 64 unscaled offset
  __ Ldff1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
  __ Ldff1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
  __ Ldff1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
  __ Ldff1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
  __ Ldff1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));

  // Scalar plus vector 64 scaled offset
  __ Lsr(z29.VnD(), z28.VnD(), 1);  // Shift right to 0x4000000040000000
  __ Add(z30.VnD(), z31.VnD(), z29.VnD());
  __ Ldff1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
  __ Ldff1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));

  __ Lsr(z29.VnD(), z29.VnD(), 1);  // Shift right to 0x2000000020000000
  __ Add(z30.VnD(), z31.VnD(), z29.VnD());
  __ Ldff1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
  __ Ldff1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));

  __ Lsr(z29.VnD(), z29.VnD(), 1);  // Shift right to 0x1000000010000000
  __ Add(z30.VnD(), z31.VnD(), z29.VnD());
  __ Ldff1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));

  __ Rdffr(p1.VnB());
  __ Cntp(x10, all, p1.VnB());

  END();

  if (CAN_RUN()) {
    RUN();

    int64_t loaded_data_in_bytes = core.xreg(x10.GetCode());
    // Only check 128 bits in this test.
    if (loaded_data_in_bytes < kQRegSizeInBytes) {
      // Report a warning when the fault-tolerant loads detected faults before
      // all of the expected loads were performed.
printf( "WARNING: Fault-tolerant loads detected faults before the " "expected loads completed.\n"); return; } // Scalar plus vector 32 unscaled offset uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001}; uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001}; uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001}; uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001}; uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001}; ASSERT_EQUAL_SVE(expected_z1, z1.VnS()); ASSERT_EQUAL_SVE(expected_z2, z2.VnS()); ASSERT_EQUAL_SVE(expected_z3, z3.VnS()); ASSERT_EQUAL_SVE(expected_z4, z4.VnS()); ASSERT_EQUAL_SVE(expected_z5, z5.VnS()); // Scalar plus vector 32 scaled offset uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001}; uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001}; uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001}; ASSERT_EQUAL_SVE(expected_z6, z6.VnS()); ASSERT_EQUAL_SVE(expected_z7, z7.VnS()); ASSERT_EQUAL_SVE(expected_z8, z8.VnS()); // Scalar plus vector 32 unpacked unscaled offset uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001}; uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001}; uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001}; uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001}; uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001}; uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); ASSERT_EQUAL_SVE(expected_z11, z11.VnD()); ASSERT_EQUAL_SVE(expected_z12, z12.VnD()); ASSERT_EQUAL_SVE(expected_z13, z13.VnD()); ASSERT_EQUAL_SVE(expected_z14, z14.VnD()); // Scalar plus vector 32 unpacked scaled offset uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001}; uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001}; uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001}; uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001}; uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001}; ASSERT_EQUAL_SVE(expected_z15, z15.VnD()); ASSERT_EQUAL_SVE(expected_z16, z16.VnD()); ASSERT_EQUAL_SVE(expected_z17, z17.VnD()); ASSERT_EQUAL_SVE(expected_z18, z18.VnD()); ASSERT_EQUAL_SVE(expected_z19, z19.VnD()); // Scalar plus vector 64 unscaled offset uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001}; uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001}; uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001}; uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001}; uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001}; ASSERT_EQUAL_SVE(expected_z20, z20.VnD()); ASSERT_EQUAL_SVE(expected_z21, z21.VnD()); ASSERT_EQUAL_SVE(expected_z22, z22.VnD()); ASSERT_EQUAL_SVE(expected_z23, z23.VnD()); ASSERT_EQUAL_SVE(expected_z24, z24.VnD()); uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001}; uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001}; uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001}; uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001}; uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001}; // Scalar plus vector 64 scaled offset ASSERT_EQUAL_SVE(expected_z25, z25.VnD()); ASSERT_EQUAL_SVE(expected_z26, z26.VnD()); ASSERT_EQUAL_SVE(expected_z27, z27.VnD()); 
ASSERT_EQUAL_SVE(expected_z28, z28.VnD()); ASSERT_EQUAL_SVE(expected_z29, z29.VnD()); } } // Emphasis on test if the modifiers are propagated and simulated correctly. TEST_SVE(sve_ld1_regression_test) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); size_t page_size = sysconf(_SC_PAGE_SIZE); VIXL_ASSERT(page_size > static_cast(config->sve_vl_in_bytes())); uintptr_t data = reinterpret_cast(mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); uintptr_t middle = data + page_size; // Fill the accessible page with arbitrary data. for (size_t i = 0; i < page_size; i++) { // Reverse bits so we get a mixture of positive and negative values. uint8_t byte = ReverseBits(static_cast(i)); memcpy(reinterpret_cast(middle + i), &byte, 1); // Make one bit roughly different in every byte and copy the bytes in the // reverse direction that convenient to verifying the loads in negative // indexes. byte += 1; memcpy(reinterpret_cast(middle - i), &byte, 1); } PRegister all = p6; __ Ptrue(all.VnB()); __ Mov(x0, middle); __ Index(z31.VnS(), 0, 3); __ Neg(z30.VnS(), z31.VnS()); // Scalar plus vector 32 unscaled offset __ Ld1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW)); __ Ld1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW)); __ Ld1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW)); __ Ld1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW)); __ Ld1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW)); // Scalar plus vector 32 scaled offset __ Ld1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1)); __ Ld1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2)); __ Ld1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1)); __ Index(z31.VnD(), 0, 3); __ Neg(z30.VnD(), z31.VnD()); // Ensure only the low 32 bits are used for the testing with positive index // values. It also test if the indexes are treated as positive in `uxtw` form. __ Mov(x3, 0x8000000080000000); __ Dup(z28.VnD(), x3); __ Sub(x2, x0, 0x80000000); __ Add(z29.VnD(), z31.VnD(), z28.VnD()); // Scalar plus vector 32 unpacked unscaled offset __ Ld1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW)); __ Ld1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW)); __ Ld1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW)); __ Ld1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW)); __ Ld1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW)); __ Ld1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW)); // Scalar plus vector 32 unpacked scaled offset __ Ld1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1)); __ Ld1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2)); __ Ld1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3)); __ Ld1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1)); __ Ld1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2)); __ Sub(x0, x0, x3); // Note that the positive indexes has been added by `0x8000000080000000`. The // wrong address will be accessed if the address is treated as negative. 
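// In other words, x0 is now biased down by 0x8000000080000000 while each lane
// of z29 is biased up by the same amount, so x0 + z29[i] wraps modulo 2^64
// back to middle + (3 * i). This catches an implementation that widens or
// sign-extends the 64-bit vector offsets instead of performing plain
// modulo-2^64 address arithmetic.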
// Scalar plus vector 64 unscaled offset __ Ld1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD())); __ Ld1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD())); __ Ld1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD())); __ Ld1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD())); __ Ld1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD())); // Scalar plus vector 64 scaled offset __ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000 __ Add(z30.VnD(), z31.VnD(), z29.VnD()); __ Ld1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1)); __ Ld1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1)); __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000 __ Add(z30.VnD(), z31.VnD(), z29.VnD()); __ Ld1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2)); __ Ld1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2)); __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000 __ Add(z30.VnD(), z31.VnD(), z29.VnD()); __ Ld1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3)); END(); if (CAN_RUN()) { RUN(); // Scalar plus vector 32 unscaled offset uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001}; uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001}; uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001}; uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001}; uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001}; ASSERT_EQUAL_SVE(expected_z1, z1.VnS()); ASSERT_EQUAL_SVE(expected_z2, z2.VnS()); ASSERT_EQUAL_SVE(expected_z3, z3.VnS()); ASSERT_EQUAL_SVE(expected_z4, z4.VnS()); ASSERT_EQUAL_SVE(expected_z5, z5.VnS()); // Scalar plus vector 32 scaled offset uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001}; uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001}; uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001}; ASSERT_EQUAL_SVE(expected_z6, z6.VnS()); ASSERT_EQUAL_SVE(expected_z7, z7.VnS()); ASSERT_EQUAL_SVE(expected_z8, z8.VnS()); // Scalar plus vector 32 unpacked unscaled offset uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001}; uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001}; uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001}; uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001}; uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001}; uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); ASSERT_EQUAL_SVE(expected_z11, z11.VnD()); ASSERT_EQUAL_SVE(expected_z12, z12.VnD()); ASSERT_EQUAL_SVE(expected_z13, z13.VnD()); ASSERT_EQUAL_SVE(expected_z14, z14.VnD()); // Scalar plus vector 32 unpacked scaled offset uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001}; uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001}; uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001}; uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001}; uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001}; ASSERT_EQUAL_SVE(expected_z15, z15.VnD()); ASSERT_EQUAL_SVE(expected_z16, z16.VnD()); ASSERT_EQUAL_SVE(expected_z17, z17.VnD()); ASSERT_EQUAL_SVE(expected_z18, z18.VnD()); ASSERT_EQUAL_SVE(expected_z19, z19.VnD()); // Scalar plus vector 64 unscaled offset uint64_t expected_z20[] = {0x00000000000000c0, 
0x0000000000000001}; uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001}; uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001}; uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001}; uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001}; ASSERT_EQUAL_SVE(expected_z20, z20.VnD()); ASSERT_EQUAL_SVE(expected_z21, z21.VnD()); ASSERT_EQUAL_SVE(expected_z22, z22.VnD()); ASSERT_EQUAL_SVE(expected_z23, z23.VnD()); ASSERT_EQUAL_SVE(expected_z24, z24.VnD()); uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001}; uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001}; uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001}; uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001}; uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001}; // Scalar plus vector 64 scaled offset ASSERT_EQUAL_SVE(expected_z25, z25.VnD()); ASSERT_EQUAL_SVE(expected_z26, z26.VnD()); ASSERT_EQUAL_SVE(expected_z27, z27.VnD()); ASSERT_EQUAL_SVE(expected_z28, z28.VnD()); ASSERT_EQUAL_SVE(expected_z29, z29.VnD()); } } // Test gather loads by comparing them with the result of a set of equivalent // scalar loads. template static void GatherLoadScalarPlusVectorHelper(Test* config, unsigned msize_in_bits, unsigned esize_in_bits, Ld1Macro ld1, Ld1Macro ldff1, T mod, bool is_signed, bool is_scaled) { // SVE supports 32- and 64-bit addressing for gather loads. VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize)); static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize; SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); unsigned msize_in_bytes = msize_in_bits / kBitsPerByte; int vl = config->sve_vl_in_bytes(); uint64_t addresses[kMaxLaneCount]; uint64_t offsets[kMaxLaneCount]; uint64_t max_address = 0; uint64_t buffer_size = vl * 64; uint64_t data = reinterpret_cast(malloc(buffer_size)); // Fill the buffer with arbitrary data. Meanwhile, create the random addresses // and offsets into the buffer placed in the argument list. BufferFillingHelper(data, buffer_size, msize_in_bytes, kMaxLaneCount, offsets, addresses, &max_address); ZRegister zn = z0.WithLaneSize(esize_in_bits); ZRegister zt_ref = z1.WithLaneSize(esize_in_bits); ZRegister zt = z2.WithLaneSize(esize_in_bits); ZRegister zt_ff = z3.WithLaneSize(esize_in_bits); PRegisterWithLaneSize pg_ff = p1.WithLaneSize(esize_in_bits); PRegisterWithLaneSize pg_diff = p2.WithLaneSize(esize_in_bits); int shift = 0; if (is_scaled) { shift = std::log2(msize_in_bytes); for (unsigned i = 0; i < kMaxLaneCount; i++) { // Ensure the offsets are the multiple of the scale factor of the // operation. offsets[i] = (offsets[i] >> shift) << shift; addresses[i] = data + offsets[i]; } } PRegister all = p6; __ Ptrue(all.WithLaneSize(esize_in_bits)); PRegisterZ pg = p0.Zeroing(); Initialise(&masm, pg, 0x9abcdef012345678, 0xabcdef0123456789, 0xf4f3f1f0fefdfcfa, 0xf9f8f6f5f3f2f1ff); __ Mov(x0, data); // Generate a reference result for scalar-plus-scalar form using scalar loads. ScalarLoadHelper(&masm, vl, addresses, zt_ref, pg, esize_in_bits, msize_in_bits, is_signed); InsrHelper(&masm, zn, offsets); if (is_scaled) { // Scale down the offsets if testing scaled-offset operation. __ Lsr(zn, zn, shift); } (masm.*ld1)(zt, pg, SVEMemOperand(x0, zn, mod, shift)); Register ffr_check_count = x17; __ Mov(ffr_check_count, 0); // Test the data correctness in which the data gather load from different // addresses. The first-fault behavior test is emphasized in `Ldff1Helper`. 
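// The sequence below reloads the same addresses with the first-faulting form
// and then reads the FFR: `ffr_check_count` counts lanes where the FFR and
// the data comparison disagree (a lane reported as loaded whose value does
// not match the reference, or vice versa), so it is expected to be zero.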
__ Setffr(); (masm.*ldff1)(zt_ff, pg, SVEMemOperand(x0, zn, mod, shift)); // Compare these two vector register and place the different to // `ffr_check_count`. __ Rdffrs(pg_ff.VnB(), all.Zeroing()); __ Cmpeq(pg_diff, all.Zeroing(), zt_ref, zt_ff); __ Eor(pg_diff.VnB(), all.Zeroing(), pg_diff.VnB(), pg_ff.VnB()); __ Incp(ffr_check_count, pg_diff); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zt_ref, zt); ASSERT_EQUAL_64(0, ffr_check_count); } free(reinterpret_cast(data)); } // Test gather loads by comparing them with the result of a set of equivalent // scalar loads. template static void GatherLoadScalarPlusScalarOrImmHelper(Test* config, unsigned msize_in_bits, unsigned esize_in_bits, F sve_ld1, bool is_signed) { // SVE supports 32- and 64-bit addressing for gather loads. VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize)); static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize; SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); unsigned msize_in_bytes = msize_in_bits / kBitsPerByte; int vl = config->sve_vl_in_bytes(); uint64_t addresses[kMaxLaneCount]; uint64_t offsets[kMaxLaneCount]; uint64_t max_address = 0; uint64_t buffer_size = vl * 64; uint64_t data = reinterpret_cast(malloc(buffer_size)); BufferFillingHelper(data, buffer_size, msize_in_bytes, kMaxLaneCount, offsets, addresses, &max_address); // Maximised offsets, to ensure that the address calculation is modulo-2^64, // and that the vector addresses are not sign-extended. uint64_t uint_e_max = (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX; uint64_t maxed_offsets[kMaxLaneCount]; uint64_t maxed_offsets_imm = max_address - uint_e_max; for (unsigned i = 0; i < kMaxLaneCount; i++) { maxed_offsets[i] = addresses[i] - maxed_offsets_imm; } ZRegister zn = z0.WithLaneSize(esize_in_bits); ZRegister zt_addresses = z1.WithLaneSize(esize_in_bits); ZRegister zt_offsets = z2.WithLaneSize(esize_in_bits); ZRegister zt_maxed = z3.WithLaneSize(esize_in_bits); ZRegister zt_ref = z4.WithLaneSize(esize_in_bits); PRegisterZ pg = p0.Zeroing(); Initialise(&masm, pg, 0x9abcdef012345678, 0xabcdef0123456789, 0xf4f3f1f0fefdfcfa, 0xf9f8f6f5f3f2f0ff); // Execute each load. if (esize_in_bits == kDRegSize) { // Only test `addresses` if we can use 64-bit pointers. InsrHelper will fail // if any value won't fit in a lane of zn. InsrHelper(&masm, zn, addresses); (masm.*sve_ld1)(zt_addresses, pg, SVEMemOperand(zn)); } InsrHelper(&masm, zn, offsets); (masm.*sve_ld1)(zt_offsets, pg, SVEMemOperand(zn, data)); InsrHelper(&masm, zn, maxed_offsets); (masm.*sve_ld1)(zt_maxed, pg, SVEMemOperand(zn, maxed_offsets_imm)); // Generate a reference result using scalar loads. 
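// The reference is built lane by lane from ordinary scalar loads of the same
// addresses, so the vector-addressed loads above are checked against values
// computed without using any gather instruction.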
ScalarLoadHelper(&masm, vl, addresses, zt_ref, pg, esize_in_bits, msize_in_bits, is_signed); END(); if (CAN_RUN()) { RUN(); if (esize_in_bits == kDRegSize) { ASSERT_EQUAL_SVE(zt_ref, zt_addresses); } ASSERT_EQUAL_SVE(zt_ref, zt_offsets); ASSERT_EQUAL_SVE(zt_ref, zt_maxed); } free(reinterpret_cast(data)); } TEST_SVE(sve_ld1b_64bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1b, false); } TEST_SVE(sve_ld1h_64bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1h, false); } TEST_SVE(sve_ld1w_64bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1w, false); } TEST_SVE(sve_ld1d_64bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kDRegSize, kDRegSize, &MacroAssembler::Ld1d, false); } TEST_SVE(sve_ld1sb_64bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1sb, true); } TEST_SVE(sve_ld1sh_64bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1sh, true); } TEST_SVE(sve_ld1sw_64bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1sw, true); } TEST_SVE(sve_ld1b_32bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1b, false); } TEST_SVE(sve_ld1h_32bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1h, false); } TEST_SVE(sve_ld1w_32bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kSRegSize, kSRegSize, &MacroAssembler::Ld1w, false); } TEST_SVE(sve_ld1sb_32bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1sb, true); } TEST_SVE(sve_ld1sh_32bit_vector_plus_immediate) { GatherLoadScalarPlusScalarOrImmHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1sh, true); } TEST_SVE(sve_ld1_scalar_plus_vector_32_scaled_offset) { auto ld1_32_scaled_offset_helper = std::bind(&GatherLoadScalarPlusVectorHelper, config, std::placeholders::_1, kSRegSize, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, true); Ld1Macro ld1h = &MacroAssembler::Ld1h; Ld1Macro ldff1h = &MacroAssembler::Ldff1h; ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false); ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false); Ld1Macro ld1w = &MacroAssembler::Ld1w; Ld1Macro ldff1w = &MacroAssembler::Ldff1w; ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false); ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false); Ld1Macro ld1sh = &MacroAssembler::Ld1sh; Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true); ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true); } TEST_SVE(sve_ld1_scalar_plus_vector_32_unscaled_offset) { auto ld1_32_unscaled_offset_helper = std::bind(&GatherLoadScalarPlusVectorHelper, config, std::placeholders::_1, kSRegSize, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, false); Ld1Macro ld1b = &MacroAssembler::Ld1b; Ld1Macro ldff1b = &MacroAssembler::Ldff1b; ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, UXTW, false); ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, SXTW, false); 
Ld1Macro ld1h = &MacroAssembler::Ld1h; Ld1Macro ldff1h = &MacroAssembler::Ldff1h; ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false); ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false); Ld1Macro ld1w = &MacroAssembler::Ld1w; Ld1Macro ldff1w = &MacroAssembler::Ldff1w; ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false); ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false); Ld1Macro ld1sb = &MacroAssembler::Ld1sb; Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb; ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, UXTW, true); ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, SXTW, true); Ld1Macro ld1sh = &MacroAssembler::Ld1sh; Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true); ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true); } TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_scaled_offset) { auto ld1_32_unpacked_scaled_offset_helper = std::bind(&GatherLoadScalarPlusVectorHelper, config, std::placeholders::_1, kDRegSize, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, true); Ld1Macro ld1h = &MacroAssembler::Ld1h; Ld1Macro ldff1h = &MacroAssembler::Ldff1h; ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false); ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false); Ld1Macro ld1w = &MacroAssembler::Ld1w; Ld1Macro ldff1w = &MacroAssembler::Ldff1w; ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false); ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false); Ld1Macro ld1d = &MacroAssembler::Ld1d; Ld1Macro ldff1d = &MacroAssembler::Ldff1d; ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false); ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false); Ld1Macro ld1sh = &MacroAssembler::Ld1sh; Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true); ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true); Ld1Macro ld1sw = &MacroAssembler::Ld1sw; Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw; ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true); ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true); } TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_unscaled_offset) { auto ld1_32_unpacked_unscaled_offset_helper = std::bind(&GatherLoadScalarPlusVectorHelper, config, std::placeholders::_1, kDRegSize, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, false); Ld1Macro ld1h = &MacroAssembler::Ld1h; Ld1Macro ldff1h = &MacroAssembler::Ldff1h; ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false); ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false); Ld1Macro ld1w = &MacroAssembler::Ld1w; Ld1Macro ldff1w = &MacroAssembler::Ldff1w; ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false); ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false); Ld1Macro ld1d = &MacroAssembler::Ld1d; Ld1Macro ldff1d = &MacroAssembler::Ldff1d; ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false); ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false); Ld1Macro ld1sh = &MacroAssembler::Ld1sh; Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true); 
ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true); Ld1Macro ld1sw = &MacroAssembler::Ld1sw; Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw; ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true); ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true); } TEST_SVE(sve_ld1_scalar_plus_vector_64_scaled_offset) { auto ld1_64_scaled_offset_helper = std::bind(&GatherLoadScalarPlusVectorHelper, config, std::placeholders::_1, kDRegSize, std::placeholders::_2, std::placeholders::_3, LSL, std::placeholders::_4, true); Ld1Macro ld1h = &MacroAssembler::Ld1h; Ld1Macro ldff1h = &MacroAssembler::Ldff1h; ld1_64_scaled_offset_helper(kHRegSize, ld1h, ldff1h, false); Ld1Macro ld1w = &MacroAssembler::Ld1w; Ld1Macro ldff1w = &MacroAssembler::Ldff1w; ld1_64_scaled_offset_helper(kSRegSize, ld1w, ldff1w, false); Ld1Macro ld1d = &MacroAssembler::Ld1d; Ld1Macro ldff1d = &MacroAssembler::Ldff1d; ld1_64_scaled_offset_helper(kDRegSize, ld1d, ldff1d, false); Ld1Macro ld1sh = &MacroAssembler::Ld1sh; Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; ld1_64_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true); Ld1Macro ld1sw = &MacroAssembler::Ld1sw; Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw; ld1_64_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true); } TEST_SVE(sve_ld1_scalar_plus_vector_64_unscaled_offset) { auto ld1_64_unscaled_offset_helper = std::bind(&GatherLoadScalarPlusVectorHelper, config, std::placeholders::_1, kDRegSize, std::placeholders::_2, std::placeholders::_3, NO_SHIFT, std::placeholders::_4, false); Ld1Macro ld1b = &MacroAssembler::Ld1b; Ld1Macro ldff1b = &MacroAssembler::Ldff1b; ld1_64_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, false); Ld1Macro ld1h = &MacroAssembler::Ld1h; Ld1Macro ldff1h = &MacroAssembler::Ldff1h; ld1_64_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, false); Ld1Macro ld1w = &MacroAssembler::Ld1w; Ld1Macro ldff1w = &MacroAssembler::Ldff1w; ld1_64_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, false); Ld1Macro ld1d = &MacroAssembler::Ld1d; Ld1Macro ldff1d = &MacroAssembler::Ldff1d; ld1_64_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, false); Ld1Macro ld1sb = &MacroAssembler::Ld1sb; Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb; ld1_64_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, true); Ld1Macro ld1sh = &MacroAssembler::Ld1sh; Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh; ld1_64_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true); Ld1Macro ld1sw = &MacroAssembler::Ld1sw; Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw; ld1_64_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true); } TEST_SVE(sve_ldnt1) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int data_size = kZRegMaxSizeInBytes * 16; uint8_t* data = new uint8_t[data_size]; for (int i = 0; i < data_size; i++) { data[i] = i & 0xff; } // Set the base half-way through the buffer so we can use negative indices. 
__ Mov(x0, reinterpret_cast(&data[data_size / 2])); __ Ptrue(p0.VnB()); __ Punpklo(p1.VnH(), p0.VnB()); __ Punpklo(p2.VnH(), p1.VnB()); __ Punpklo(p3.VnH(), p2.VnB()); __ Punpklo(p4.VnH(), p3.VnB()); __ Mov(x1, 42); __ Ld1b(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1)); __ Ldnt1b(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1)); __ Mov(x1, -21); __ Ld1h(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1)); __ Ldnt1h(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1)); __ Mov(x1, 10); __ Ld1w(z4.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2)); __ Ldnt1w(z5.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2)); __ Mov(x1, -5); __ Ld1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3)); __ Ldnt1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3)); __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL)); __ Ldnt1b(z9.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL)); __ Ld1h(z10.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL)); __ Ldnt1h(z11.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL)); __ Ld1w(z12.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL)); __ Ldnt1w(z13.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL)); __ Ld1d(z14.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL)); __ Ldnt1d(z15.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL)); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z0, z1); ASSERT_EQUAL_SVE(z2, z3); ASSERT_EQUAL_SVE(z4, z5); ASSERT_EQUAL_SVE(z6, z7); ASSERT_EQUAL_SVE(z8, z9); ASSERT_EQUAL_SVE(z10, z11); ASSERT_EQUAL_SVE(z12, z13); ASSERT_EQUAL_SVE(z14, z15); } } TEST_SVE(sve_stnt1) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int data_size = kZRegMaxSizeInBytes * 16; uint8_t* data = new uint8_t[data_size]; // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); __ Ptrue(p0.VnB()); __ Punpklo(p1.VnH(), p0.VnB()); __ Punpklo(p2.VnH(), p1.VnB()); __ Punpklo(p3.VnH(), p2.VnB()); __ Punpklo(p4.VnH(), p3.VnB()); __ Dup(z0.VnB(), 0x55); __ Index(z1.VnB(), 0, 1); // Store with all-true and patterned predication, load back, and create a // reference value for later comparison. __ Rdvl(x1, 1); __ Stnt1b(z0.VnB(), p0, SVEMemOperand(x0, x1)); __ Stnt1b(z1.VnB(), p1, SVEMemOperand(x0, 1, SVE_MUL_VL)); __ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1)); __ Sel(z3.VnB(), p1, z1.VnB(), z0.VnB()); // Repeated, with wider elements and different offsets. 
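// Each Rdvl/Lsr pair below computes an element index whose LSL scaling in the
// addressing mode reproduces the byte offset of the matching MUL VL immediate
// form. For example, Rdvl(x1, -1) gives -VL bytes; the Lsr by 1 and the
// LSL #1 scale cancel modulo 2^64, so the scalar-indexed store targets
// x0 - VL, the same block as the `-1, SVE_MUL_VL` form.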
__ Rdvl(x1, -1); __ Lsr(x1, x1, 1); __ Stnt1h(z0.VnH(), p0, SVEMemOperand(x0, x1, LSL, 1)); __ Stnt1h(z1.VnH(), p2, SVEMemOperand(x0, -1, SVE_MUL_VL)); __ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 1)); __ Sel(z5.VnH(), p2, z1.VnH(), z0.VnH()); __ Rdvl(x1, 7); __ Lsr(x1, x1, 2); __ Stnt1w(z0.VnS(), p0, SVEMemOperand(x0, x1, LSL, 2)); __ Stnt1w(z1.VnS(), p3, SVEMemOperand(x0, 7, SVE_MUL_VL)); __ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 2)); __ Sel(z7.VnS(), p3, z1.VnS(), z0.VnS()); __ Rdvl(x1, -8); __ Lsr(x1, x1, 3); __ Stnt1d(z0.VnD(), p0, SVEMemOperand(x0, x1, LSL, 3)); __ Stnt1d(z1.VnD(), p4, SVEMemOperand(x0, -8, SVE_MUL_VL)); __ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 3)); __ Sel(z9.VnD(), p4, z1.VnD(), z0.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z2, z3); ASSERT_EQUAL_SVE(z4, z5); ASSERT_EQUAL_SVE(z6, z7); ASSERT_EQUAL_SVE(z8, z9); } } TEST_SVE(sve_ld1rq) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int data_size = (kQRegSizeInBytes + 128) * 2; uint8_t* data = new uint8_t[data_size]; for (int i = 0; i < data_size; i++) { data[i] = i & 0xff; } // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); __ Index(z0.VnB(), 0, 1); __ Ptrue(p0.VnB()); __ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4); __ Pfalse(p1.VnB()); __ Zip1(p1.VnB(), p0.VnB(), p1.VnB()); // Load and broadcast using scalar offsets. __ Mov(x1, -42); __ Ld1rqb(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1)); __ Add(x2, x0, 1); __ Mov(x1, -21); __ Punpklo(p2.VnH(), p1.VnB()); __ Ld1rqh(z1.VnH(), p2.Zeroing(), SVEMemOperand(x2, x1, LSL, 1)); __ Add(x2, x2, 1); __ Mov(x1, -10); __ Punpklo(p3.VnH(), p2.VnB()); __ Ld1rqw(z2.VnS(), p3.Zeroing(), SVEMemOperand(x2, x1, LSL, 2)); __ Add(x2, x2, 1); __ Mov(x1, 5); __ Punpklo(p4.VnH(), p3.VnB()); __ Ld1rqd(z3.VnD(), p4.Zeroing(), SVEMemOperand(x2, x1, LSL, 3)); // Check that all segments match by rotating the vector by one segment, // eoring, and orring across the vector. __ Mov(z4, z0); __ Ext(z4.VnB(), z4.VnB(), z4.VnB(), 16); __ Eor(z4.VnB(), z4.VnB(), z0.VnB()); __ Orv(b4, p0, z4.VnB()); __ Mov(z5, z1); __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16); __ Eor(z5.VnB(), z5.VnB(), z1.VnB()); __ Orv(b5, p0, z5.VnB()); __ Orr(z4, z4, z5); __ Mov(z5, z2); __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16); __ Eor(z5.VnB(), z5.VnB(), z2.VnB()); __ Orv(b5, p0, z5.VnB()); __ Orr(z4, z4, z5); __ Mov(z5, z3); __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16); __ Eor(z5.VnB(), z5.VnB(), z3.VnB()); __ Orv(b5, p0, z5.VnB()); __ Orr(z4, z4, z5); // Load and broadcast the same values, using immediate offsets. 
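// The immediate form of ld1rq only encodes multiples of 16 bytes, so each
// base register is adjusted so that base + immediate lands on the address
// used by the corresponding scalar-offset load above. For example,
// (x0 + 6) - 48 is the same address as the x0 - 42 used for z0.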
__ Add(x1, x0, 6); __ Ld1rqb(z5.VnB(), p1.Zeroing(), SVEMemOperand(x1, -48)); __ Add(x1, x0, -9); __ Ld1rqh(z6.VnH(), p2.Zeroing(), SVEMemOperand(x1, -32)); __ Add(x1, x0, -70); __ Ld1rqw(z7.VnS(), p3.Zeroing(), SVEMemOperand(x1, 32)); __ Add(x1, x0, 27); __ Ld1rqd(z8.VnD(), p4.Zeroing(), SVEMemOperand(x1, 16)); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z0[] = {0x0000000000000000, 0x006c006a00680066}; uint64_t expected_z1[] = {0x000074730000706f, 0x00006c6b00006867}; uint64_t expected_z2[] = {0x0000000075747372, 0x000000006d6c6b6a}; uint64_t expected_z3[] = {0x0000000000000000, 0xc2c1c0bfbebdbcbb}; uint64_t expected_z4[] = {0, 0}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); ASSERT_EQUAL_SVE(z0, z5); ASSERT_EQUAL_SVE(z1, z6); ASSERT_EQUAL_SVE(z2, z7); ASSERT_EQUAL_SVE(z3, z8); } } TEST_SVE(sve_st1_vec_imm) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE); START(); // TODO: Use mmap() to request a buffer in the low 4GB, which allows testing // 32-bit address vectors. int data_size = kZRegMaxSizeInBytes * 16; uint8_t* data = new uint8_t[data_size]; // Set the base to 16 bytes from the end of the buffer so we can use negative // indices. __ Mov(x0, reinterpret_cast(&data[data_size - 16])); __ Ptrue(p0.VnB()); // Store a vector of index values in reverse order, using // vector-plus-immediate addressing to begin at byte 15, then storing to // bytes 14, 13, etc. __ Index(z1.VnD(), x0, -1); __ Index(z2.VnD(), 0, 1); // Iterate in order to store at least 16 bytes. The number of iterations // depends on VL, eg. VL128 iterates eight times, storing bytes 15 and 14 // on the first iteration, 13 and 12 on the next, etc. uint64_t dlanes = config->sve_vl_in_bytes() / kDRegSizeInBytes; for (int i = 15; i >= 0; i -= dlanes * kBRegSizeInBytes) { __ St1b(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i)); __ Incd(z2.VnD()); } // Reload the stored data, and build a reference for comparison. The reference // is truncated to a Q register, as only the least-significant 128 bits are // checked. __ Ldr(q4, MemOperand(x0)); __ Index(z5.VnB(), 15, -1); __ Mov(q5, q5); // Repeat for wider elements. __ Index(z1.VnD(), x0, -2); // Stepping by -2 for H-sized elements. __ Index(z2.VnD(), 0, 1); for (int i = 14; i >= 0; i -= dlanes * kHRegSizeInBytes) { __ St1h(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i)); __ Incd(z2.VnD()); } __ Ldr(q6, MemOperand(x0)); __ Index(z7.VnH(), 7, -1); __ Mov(q7, q7); __ Index(z1.VnD(), x0, -4); // Stepping by -4 for S-sized elements. __ Index(z2.VnD(), 0, 1); for (int i = 12; i >= 0; i -= dlanes * kSRegSizeInBytes) { __ St1w(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i)); __ Incd(z2.VnD()); } __ Ldr(q8, MemOperand(x0)); __ Index(z9.VnS(), 3, -1); __ Mov(q9, q9); __ Index(z1.VnD(), x0, -8); // Stepping by -8 for D-sized elements. __ Index(z2.VnD(), 0, 1); for (int i = 8; i >= 0; i -= dlanes * kDRegSizeInBytes) { __ St1d(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i)); __ Incd(z2.VnD()); } __ Ldr(q10, MemOperand(x0)); __ Index(z11.VnD(), 1, -1); __ Mov(q11, q11); // Test predication by storing even halfwords to memory (using predication) // at byte-separated addresses. The result should be the same as storing // even halfwords contiguously to memory. 
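// Zip1 interleaves the all-true p0 with the all-false p1, leaving a predicate
// in which every other D-sized lane (the even-numbered ones) is active.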
__ Pfalse(p1.VnB()); __ Zip1(p1.VnD(), p0.VnD(), p1.VnD()); __ Mov(x0, reinterpret_cast(data)); __ Index(z1.VnD(), x0, 1); __ Index(z2.VnD(), 0x1000, 1); for (int i = 0; i < 16; i += dlanes) { __ St1h(z2.VnD(), p1, SVEMemOperand(z1.VnD(), i)); __ Incd(z2.VnD()); } __ Ldr(q2, MemOperand(x0)); __ Index(z3.VnH(), 0x1000, 2); __ Mov(q3, q3); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z3, z2); ASSERT_EQUAL_SVE(z5, z4); ASSERT_EQUAL_SVE(z7, z6); ASSERT_EQUAL_SVE(z9, z8); ASSERT_EQUAL_SVE(z11, z10); } } template static void sve_st1_scalar_plus_vector_helper(Test* config, int esize_in_bits, T mod, bool is_scaled) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int vl = config->sve_vl_in_bytes(); int data_size = vl * 160; uint8_t* data = new uint8_t[data_size]; memset(data, 0, data_size); int vl_per_esize = vl / (esize_in_bits / kBitsPerByte); ZRegister zn_b = z0.WithLaneSize(esize_in_bits); ZRegister zn_h = z1.WithLaneSize(esize_in_bits); ZRegister zn_s = z2.WithLaneSize(esize_in_bits); ZRegister zn_d = z3.WithLaneSize(esize_in_bits); ZRegister zn_ld_b = z10.WithLaneSize(esize_in_bits); ZRegister zn_ld_h = z11.WithLaneSize(esize_in_bits); ZRegister zn_ld_s = z12.WithLaneSize(esize_in_bits); ZRegister zn_ld_d = z13.WithLaneSize(esize_in_bits); ZRegister offsets = z31.WithLaneSize(esize_in_bits); // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); __ Ptrue(p6.WithLaneSize(esize_in_bits)); __ Pfalse(p7.WithLaneSize(esize_in_bits)); __ Zip1(p0.WithLaneSize(esize_in_bits), p6.WithLaneSize(esize_in_bits), p7.WithLaneSize(esize_in_bits)); __ Zip1(p1.WithLaneSize(esize_in_bits), p7.WithLaneSize(esize_in_bits), p6.WithLaneSize(esize_in_bits)); // `st1b` doesn't have the scaled-offset forms. if (is_scaled == false) { // Simply stepping the index by 2 to simulate a scatter memory access. __ Index(offsets, 1, 2); __ St1b(offsets, p0, SVEMemOperand(x0, offsets, mod)); __ Ld1b(zn_ld_b, p0.Zeroing(), SVEMemOperand(x0, offsets, mod)); __ Dup(zn_b, 0); __ Mov(zn_b, p0.Merging(), offsets); } // Store the values to isolated range different with other stores. int scale = is_scaled ? 1 : 0; __ Add(x1, x0, vl_per_esize * 4); __ Index(offsets, 6, 4); __ St1h(offsets, p0, SVEMemOperand(x1, offsets, mod, scale)); __ Ld1h(zn_ld_h, p0.Zeroing(), SVEMemOperand(x1, offsets, mod, scale)); __ Dup(zn_h, 0); __ Mov(zn_h, p0.Merging(), offsets); scale = is_scaled ? 2 : 0; __ Add(x2, x0, UINT64_MAX + (vl_per_esize * -8) + 1); __ Index(offsets, 64, 8); if ((std::is_same::value) && (static_cast(mod) == SXTW)) { // Testing negative offsets. __ Neg(offsets, p6.Merging(), offsets); } __ St1w(offsets, p1, SVEMemOperand(x2, offsets, mod, scale)); __ Ld1w(zn_ld_s, p1.Zeroing(), SVEMemOperand(x2, offsets, mod, scale)); __ Dup(zn_s, 0); __ Mov(zn_s, p1.Merging(), offsets); if (esize_in_bits == kDRegSize) { // Test st1w by comparing the 32-bit value loaded correspondingly with the // 32-bit value stored. __ Lsl(zn_s, zn_s, kSRegSize); __ Lsr(zn_s, zn_s, kSRegSize); } // `st1d` doesn't have the S-sized lane forms. if (esize_in_bits == kDRegSize) { scale = is_scaled ? 
3 : 0; __ Add(x3, x0, UINT64_MAX + (vl_per_esize * -16) + 1); __ Index(offsets, 128, 16); if ((std::is_same::value) && (static_cast(mod) == SXTW)) { __ Neg(offsets, p6.Merging(), offsets); } __ St1d(offsets, p1, SVEMemOperand(x3, offsets, mod, scale)); __ Ld1d(zn_ld_d, p1.Zeroing(), SVEMemOperand(x3, offsets, mod, scale)); __ Dup(zn_d, 0); __ Mov(zn_d, p1.Merging(), offsets); } END(); if (CAN_RUN()) { RUN(); if (scale == false) { ASSERT_EQUAL_SVE(zn_ld_b, zn_b); } ASSERT_EQUAL_SVE(zn_ld_h, zn_h); ASSERT_EQUAL_SVE(zn_ld_s, zn_s); if (esize_in_bits == kDRegSize) { ASSERT_EQUAL_SVE(zn_ld_d, zn_d); } } delete[] data; } TEST_SVE(sve_st1_sca_vec_32_unpacked_unscaled) { sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, false); sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, false); } TEST_SVE(sve_st1_sca_vec_32_unpacked_scaled) { sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, true); sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, true); } TEST_SVE(sve_st1_sca_vec_32_unscaled) { sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, false); sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, false); } TEST_SVE(sve_st1_sca_vec_32_scaled) { sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, true); sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, true); } TEST_SVE(sve_st1_sca_vec_64_scaled) { sve_st1_scalar_plus_vector_helper(config, kDRegSize, LSL, true); } TEST_SVE(sve_st1_sca_vec_64_unscaled) { sve_st1_scalar_plus_vector_helper(config, kDRegSize, NO_SHIFT, false); } typedef void (MacroAssembler::*IntWideImmFn)(const ZRegister& zd, const ZRegister& zn, const IntegerOperand imm); template static void IntWideImmHelper(Test* config, F macro, unsigned lane_size_in_bits, const Tn& zn_inputs, IntegerOperand imm, const Td& zd_expected) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); ZRegister zd1 = z0.WithLaneSize(lane_size_in_bits); InsrHelper(&masm, zd1, zn_inputs); // Also test with a different zn, to test the movprfx case. ZRegister zn = z1.WithLaneSize(lane_size_in_bits); InsrHelper(&masm, zn, zn_inputs); ZRegister zd2 = z2.WithLaneSize(lane_size_in_bits); ZRegister zn_copy = z3.WithSameLaneSizeAs(zn); // Make a copy so we can check that constructive operations preserve zn. __ Mov(zn_copy, zn); { UseScratchRegisterScope temps(&masm); // The MacroAssembler needs a P scratch register for some of these macros, // and it doesn't have one by default. temps.Include(p3); (masm.*macro)(zd1, zd1, imm); (masm.*macro)(zd2, zn, imm); } END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zd_expected, zd1); // Check the result from `instr` with movprfx is the same as // the immediate version. 
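// Writing to zd2 (distinct from zn) exercises the path where the
// MacroAssembler has to preserve the source operand, typically via movprfx,
// whereas the zd1 form uses the plain destructive encoding. Both must match
// the expected result, and zn itself must be left untouched.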
ASSERT_EQUAL_SVE(zd_expected, zd2); ASSERT_EQUAL_SVE(zn_copy, zn); } } TEST_SVE(sve_int_wide_imm_unpredicated_smax) { int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55}; int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555}; int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555}; int64_t in_d[] = {1, 10, 10000, 1000000}; IntWideImmFn fn = &MacroAssembler::Smax; int exp_b_1[] = {0, -1, 127, -1, 126, 1, -1, 55}; int exp_h_1[] = {127, 127, 127, 127, INT16_MAX, 127, 127, 5555}; int exp_s_1[] = {0, -128, 127, -128, INT32_MAX, 1, -1, 555555}; int64_t exp_d_1[] = {99, 99, 10000, 1000000}; IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1); int exp_h_2[] = {0, -128, 127, -255, INT16_MAX, 1, -1, 5555}; int exp_s_2[] = {2048, 2048, 2048, 2048, INT32_MAX, 2048, 2048, 555555}; int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000}; // The immediate is in the range [-128, 127], but the macro is able to // synthesise unencodable immediates. // B-sized lanes cannot take an immediate out of the range [-128, 127]. IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2); } TEST_SVE(sve_int_wide_imm_unpredicated_smin) { int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55}; int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555}; int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555}; int64_t in_d[] = {1, 10, 10000, 1000000}; IntWideImmFn fn = &MacroAssembler::Smin; int exp_b_1[] = {-1, -128, -1, -127, -1, -1, -1, -1}; int exp_h_1[] = {0, -128, 127, INT16_MIN, 127, 1, -1, 127}; int exp_s_1[] = {-128, -128, -128, INT32_MIN, -128, -128, -128, -128}; int64_t exp_d_1[] = {1, 10, 99, 99}; IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1); int exp_h_2[] = {-255, -255, -255, INT16_MIN, -255, -255, -255, -255}; int exp_s_2[] = {0, -128, 127, INT32_MIN, 2048, 1, -1, 2048}; int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX}; // The immediate is in the range [-128, 127], but the macro is able to // synthesise unencodable immediates. // B-sized lanes cannot take an immediate out of the range [-128, 127]. 
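// For immediates that smax/smin cannot encode, the macro presumably
// materialises the value in a scratch Z register and falls back to the
// predicated register form; this is why IntWideImmHelper makes a predicate
// scratch register (p3) available to the MacroAssembler.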
IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2); } TEST_SVE(sve_int_wide_imm_unpredicated_umax) { int in_b[] = {0, 255, 127, 0x80, 1, 55}; int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555}; int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555}; int64_t in_d[] = {1, 10, 10000, 1000000}; IntWideImmFn fn = &MacroAssembler::Umax; int exp_b_1[] = {17, 255, 127, 0x80, 17, 55}; int exp_h_1[] = {127, 255, 127, INT16_MAX, 127, 5555}; int exp_s_1[] = {255, 255, 255, INT32_MAX, 255, 555555}; int64_t exp_d_1[] = {99, 99, 10000, 1000000}; IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, 0xff, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1); int exp_h_2[] = {511, 511, 511, INT16_MAX, 511, 5555}; int exp_s_2[] = {2048, 2048, 2048, INT32_MAX, 2048, 555555}; int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000}; // The immediate is in the range [0, 255], but the macro is able to // synthesise unencodable immediates. // B-sized lanes cannot take an immediate out of the range [0, 255]. IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2); } TEST_SVE(sve_int_wide_imm_unpredicated_umin) { int in_b[] = {0, 255, 127, 0x80, 1, 55}; int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555}; int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555}; int64_t in_d[] = {1, 10, 10000, 1000000}; IntWideImmFn fn = &MacroAssembler::Umin; int exp_b_1[] = {0, 17, 17, 17, 1, 17}; int exp_h_1[] = {0, 127, 127, 127, 1, 127}; int exp_s_1[] = {0, 255, 127, 255, 1, 255}; int64_t exp_d_1[] = {1, 10, 99, 99}; IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, 255, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1); int exp_h_2[] = {0, 255, 127, 511, 1, 511}; int exp_s_2[] = {0, 255, 127, 2048, 1, 2048}; int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX}; // The immediate is in the range [0, 255], but the macro is able to // synthesise unencodable immediates. // B-sized lanes cannot take an immediate out of the range [0, 255]. 
IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2); } TEST_SVE(sve_int_wide_imm_unpredicated_mul) { int in_b[] = {11, -1, 7, -3}; int in_h[] = {111, -1, 17, -123}; int in_s[] = {11111, -1, 117, -12345}; int64_t in_d[] = {0x7fffffff, 0x80000000}; IntWideImmFn fn = &MacroAssembler::Mul; int exp_b_1[] = {66, -6, 42, -18}; int exp_h_1[] = {-14208, 128, -2176, 15744}; int exp_s_1[] = {11111 * 127, -127, 117 * 127, -12345 * 127}; int64_t exp_d_1[] = {0xfffffffe, 0x100000000}; IntWideImmHelper(config, fn, kBRegSize, in_b, 6, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, -128, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 2, exp_d_1); int exp_h_2[] = {-28305, 255, -4335, 31365}; int exp_s_2[] = {22755328, -2048, 239616, -25282560}; int64_t exp_d_2[] = {0x00000063ffffff38, 0x0000006400000000}; // The immediate is in the range [-128, 127], but the macro is able to // synthesise unencodable immediates. // B-sized lanes cannot take an immediate out of the range [0, 255]. IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, 200, exp_d_2); // Integer overflow on multiplication. unsigned exp_b_3[] = {0x75, 0x81, 0x79, 0x83}; IntWideImmHelper(config, fn, kBRegSize, in_b, 0x7f, exp_b_3); } TEST_SVE(sve_int_wide_imm_unpredicated_add) { unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff}; unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa}; unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0}; uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f}; IntWideImmFn fn = &MacroAssembler::Add; unsigned exp_b_1[] = {0x02, 0x00, 0x91, 0x80}; unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba}; unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f}; uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e}; // Encodable with `add` (shift 0). IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1); unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa}; unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0}; uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f}; // Encodable with `add` (shift 8). // B-sized lanes cannot take a shift of 8. IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2); unsigned exp_s_3[] = {0x80808181, 0x807e7f7f, 0xab29aaaa, 0xf07ff0f0}; // The macro is able to synthesise unencodable immediates. IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3); unsigned exp_b_4[] = {0x61, 0x5f, 0xf0, 0xdf}; unsigned exp_h_4[] = {0x6181, 0x5f7f, 0xf010, 0x8aaa}; unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0}; uint64_t exp_d_4[] = {0x8000000180018180, 0x7fffffff7fff7f7e}; // Negative immediates use `sub`. 
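// For example, Add with -0x20 is emitted as a subtraction of 0x20:
// 0x81 - 0x20 = 0x61, which is the first element of exp_b_4.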
IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4); IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4); IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4); IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4); } TEST_SVE(sve_int_wide_imm_unpredicated_sqadd) { unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff}; unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa}; unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0}; uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f}; IntWideImmFn fn = &MacroAssembler::Sqadd; unsigned exp_b_1[] = {0x02, 0x7f, 0x7f, 0x7f}; unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba}; unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f}; uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e}; // Encodable with `sqadd` (shift 0). // Note that encodable immediates are unsigned, even for signed saturation. IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1); unsigned exp_h_2[] = {0x9181, 0x7fff, 0x2010, 0xbaaa}; unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0}; uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f}; // Encodable with `sqadd` (shift 8). // B-sized lanes cannot take a shift of 8. IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2); } TEST_SVE(sve_int_wide_imm_unpredicated_uqadd) { unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff}; unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa}; unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0}; uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f}; IntWideImmFn fn = &MacroAssembler::Uqadd; unsigned exp_b_1[] = {0xff, 0xff, 0x91, 0xff}; unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba}; unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f}; uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e}; // Encodable with `uqadd` (shift 0). IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1); unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa}; unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0}; uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f}; // Encodable with `uqadd` (shift 8). // B-sized lanes cannot take a shift of 8. IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2); } TEST_SVE(sve_int_wide_imm_unpredicated_sub) { unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff}; unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa}; unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0}; uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f}; IntWideImmFn fn = &MacroAssembler::Sub; unsigned exp_b_1[] = {0x00, 0xfe, 0x8f, 0x7e}; unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a}; unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071}; uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80}; // Encodable with `sub` (shift 0). 
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1); unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa}; unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0}; uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f}; // Encodable with `sub` (shift 8). // B-sized lanes cannot take a shift of 8. IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2); unsigned exp_s_3[] = {0x7f828181, 0x7f807f7f, 0xaa2baaaa, 0xef81f0f0}; // The macro is able to synthesise unencodable immediates. IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3); unsigned exp_b_4[] = {0xa1, 0x9f, 0x30, 0x1f}; unsigned exp_h_4[] = {0xa181, 0x9f7f, 0x3010, 0xcaaa}; unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0}; uint64_t exp_d_4[] = {0x8000000180018182, 0x7fffffff7fff7f80}; // Negative immediates use `add`. IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4); IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4); IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4); IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4); } TEST_SVE(sve_int_wide_imm_unpredicated_sqsub) { unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff}; unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa}; unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0}; uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f}; IntWideImmFn fn = &MacroAssembler::Sqsub; unsigned exp_b_1[] = {0x80, 0xfe, 0x8f, 0x80}; unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a}; unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071}; uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80}; // Encodable with `sqsub` (shift 0). // Note that encodable immediates are unsigned, even for signed saturation. IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1); unsigned exp_h_2[] = {0x8000, 0x6f7f, 0x0010, 0x9aaa}; unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0}; uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f}; // Encodable with `sqsub` (shift 8). // B-sized lanes cannot take a shift of 8. IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2); } TEST_SVE(sve_int_wide_imm_unpredicated_uqsub) { unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff}; unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa}; unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0}; uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f}; IntWideImmFn fn = &MacroAssembler::Uqsub; unsigned exp_b_1[] = {0x00, 0x00, 0x00, 0x7e}; unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a}; unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071}; uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80}; // Encodable with `uqsub` (shift 0). 
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1); IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1); IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1); IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1); unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa}; unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0}; uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f}; // Encodable with `uqsub` (shift 8). // B-sized lanes cannot take a shift of 8. IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2); IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2); IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2); } TEST_SVE(sve_int_wide_imm_unpredicated_subr) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Encodable with `subr` (shift 0). __ Index(z0.VnD(), 1, 1); __ Sub(z0.VnD(), 100, z0.VnD()); __ Index(z1.VnS(), 0x7f, 1); __ Sub(z1.VnS(), 0xf7, z1.VnS()); __ Index(z2.VnH(), 0xaaaa, 0x2222); __ Sub(z2.VnH(), 0x80, z2.VnH()); __ Index(z3.VnB(), 133, 1); __ Sub(z3.VnB(), 255, z3.VnB()); // Encodable with `subr` (shift 8). __ Index(z4.VnD(), 256, -1); __ Sub(z4.VnD(), 42 * 256, z4.VnD()); __ Index(z5.VnS(), 0x7878, 1); __ Sub(z5.VnS(), 0x8000, z5.VnS()); __ Index(z6.VnH(), 0x30f0, -1); __ Sub(z6.VnH(), 0x7f00, z6.VnH()); // B-sized lanes cannot take a shift of 8. // Select with movprfx. __ Index(z31.VnD(), 256, 4001); __ Sub(z7.VnD(), 42 * 256, z31.VnD()); // Out of immediate encodable range of `sub`. __ Index(z30.VnS(), 0x11223344, 1); __ Sub(z8.VnS(), 0x88776655, z30.VnS()); END(); if (CAN_RUN()) { RUN(); int expected_z0[] = {87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); int expected_z1[] = {0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78}; ASSERT_EQUAL_SVE(expected_z1, z1.VnS()); int expected_z2[] = {0xab2c, 0xcd4e, 0xef70, 0x1192, 0x33b4, 0x55d6}; ASSERT_EQUAL_SVE(expected_z2, z2.VnH()); int expected_z3[] = {0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a}; ASSERT_EQUAL_SVE(expected_z3, z3.VnB()); int expected_z4[] = {10502, 10501, 10500, 10499, 10498, 10497, 10496}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); int expected_z5[] = {0x0783, 0x0784, 0x0785, 0x0786, 0x0787, 0x0788}; ASSERT_EQUAL_SVE(expected_z5, z5.VnS()); int expected_z6[] = {0x4e15, 0x4e14, 0x4e13, 0x4e12, 0x4e11, 0x4e10}; ASSERT_EQUAL_SVE(expected_z6, z6.VnH()); int expected_z7[] = {-13510, -9509, -5508, -1507, 2494, 6495, 10496}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); int expected_z8[] = {0x7755330e, 0x7755330f, 0x77553310, 0x77553311}; ASSERT_EQUAL_SVE(expected_z8, z8.VnS()); } } TEST_SVE(sve_int_wide_imm_unpredicated_fdup) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Immediates which can be encoded in the instructions. __ Fdup(z0.VnH(), RawbitsToFloat16(0xc500)); __ Fdup(z1.VnS(), Float16(2.0)); __ Fdup(z2.VnD(), Float16(3.875)); __ Fdup(z3.VnH(), 8.0f); __ Fdup(z4.VnS(), -4.75f); __ Fdup(z5.VnD(), 0.5f); __ Fdup(z6.VnH(), 1.0); __ Fdup(z7.VnS(), 2.125); __ Fdup(z8.VnD(), -13.0); // Immediates which cannot be encoded in the instructions. 
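// Fdup's immediate field is the standard 8-bit floating-point immediate,
// covering values of the form +/-(16..31)/16 * 2^n for a small exponent
// range. Zero, infinities and values such as 255.0 or 12.3456 fall outside
// that set, so the macro has to synthesise them some other way (for instance
// by duplicating the raw bit pattern), which is what these cases exercise.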
__ Fdup(z10.VnH(), Float16(0.0)); __ Fdup(z11.VnH(), kFP16PositiveInfinity); __ Fdup(z12.VnS(), 255.0f); __ Fdup(z13.VnS(), kFP32NegativeInfinity); __ Fdup(z14.VnD(), 12.3456); __ Fdup(z15.VnD(), kFP64PositiveInfinity); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(0xc500, z0.VnH()); ASSERT_EQUAL_SVE(0x40000000, z1.VnS()); ASSERT_EQUAL_SVE(0x400f000000000000, z2.VnD()); ASSERT_EQUAL_SVE(0x4800, z3.VnH()); ASSERT_EQUAL_SVE(FloatToRawbits(-4.75f), z4.VnS()); ASSERT_EQUAL_SVE(DoubleToRawbits(0.5), z5.VnD()); ASSERT_EQUAL_SVE(0x3c00, z6.VnH()); ASSERT_EQUAL_SVE(FloatToRawbits(2.125f), z7.VnS()); ASSERT_EQUAL_SVE(DoubleToRawbits(-13.0), z8.VnD()); ASSERT_EQUAL_SVE(0x0000, z10.VnH()); ASSERT_EQUAL_SVE(Float16ToRawbits(kFP16PositiveInfinity), z11.VnH()); ASSERT_EQUAL_SVE(FloatToRawbits(255.0), z12.VnS()); ASSERT_EQUAL_SVE(FloatToRawbits(kFP32NegativeInfinity), z13.VnS()); ASSERT_EQUAL_SVE(DoubleToRawbits(12.3456), z14.VnD()); ASSERT_EQUAL_SVE(DoubleToRawbits(kFP64PositiveInfinity), z15.VnD()); } } TEST_SVE(sve_andv_eorv_orv) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in[] = {0x8899aabbccddeeff, 0x7777555533331111, 0x123456789abcdef0}; InsrHelper(&masm, z31.VnD(), in); // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 1, 1, 0 // For S lanes: 1, 1, 1, 0, 0 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0}; Initialise(&masm, p0.VnB(), pg_in); // Make a copy so we can check that constructive operations preserve zn. __ Mov(z0, z31); __ Andv(b0, p0, z0.VnB()); // destructive __ Andv(h1, p0, z31.VnH()); __ Mov(z2, z31); __ Andv(s2, p0, z2.VnS()); // destructive __ Andv(d3, p0, z31.VnD()); __ Eorv(b4, p0, z31.VnB()); __ Mov(z5, z31); __ Eorv(h5, p0, z5.VnH()); // destructive __ Eorv(s6, p0, z31.VnS()); __ Mov(z7, z31); __ Eorv(d7, p0, z7.VnD()); // destructive __ Mov(z8, z31); __ Orv(b8, p0, z8.VnB()); // destructive __ Orv(h9, p0, z31.VnH()); __ Mov(z10, z31); __ Orv(s10, p0, z10.VnS()); // destructive __ Orv(d11, p0, z31.VnD()); END(); if (CAN_RUN()) { RUN(); if (static_cast(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) { ASSERT_EQUAL_64(0x10, d0); ASSERT_EQUAL_64(0x1010, d1); ASSERT_EQUAL_64(0x33331111, d2); ASSERT_EQUAL_64(0x7777555533331111, d3); ASSERT_EQUAL_64(0xbf, d4); ASSERT_EQUAL_64(0xedcb, d5); ASSERT_EQUAL_64(0x44444444, d6); ASSERT_EQUAL_64(0x7777555533331111, d7); ASSERT_EQUAL_64(0xff, d8); ASSERT_EQUAL_64(0xffff, d9); ASSERT_EQUAL_64(0x77775555, d10); ASSERT_EQUAL_64(0x7777555533331111, d11); } else { ASSERT_EQUAL_64(0, d0); ASSERT_EQUAL_64(0x0010, d1); ASSERT_EQUAL_64(0x00110011, d2); ASSERT_EQUAL_64(0x0011001100110011, d3); ASSERT_EQUAL_64(0x62, d4); ASSERT_EQUAL_64(0x0334, d5); ASSERT_EQUAL_64(0x8899aabb, d6); ASSERT_EQUAL_64(0xffeeffeeffeeffee, d7); ASSERT_EQUAL_64(0xff, d8); ASSERT_EQUAL_64(0xffff, d9); ASSERT_EQUAL_64(0xffffffff, d10); ASSERT_EQUAL_64(0xffffffffffffffff, d11); } // Check the upper lanes above the top of the V register are all clear. 
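  // (Background, for illustration: each reduction writes a scalar B/H/S/D
  // result, and writing a scalar or V register is expected to zero the
  // remaining bits of the corresponding Z register. For example,
  // `Andv(b0, p0, z0.VnB())` leaves only the low byte of z0 significant, so
  // every D lane of z0 from lane 1 upwards should read as zero, which is what
  // the loop below verifies.)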
for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) { ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z8.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z9.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z10.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z11.VnD(), i); } } } TEST_SVE(sve_saddv_uaddv) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in[] = {0x8899aabbccddeeff, 0x8182838485868788, 0x0807060504030201}; InsrHelper(&masm, z31.VnD(), in); // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 1, 1, 0 // For S lanes: 1, 1, 1, 0, 0 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0}; Initialise(&masm, p0.VnB(), pg_in); // Make a copy so we can check that constructive operations preserve zn. __ Mov(z0, z31); __ Saddv(b0, p0, z0.VnB()); // destructive __ Saddv(h1, p0, z31.VnH()); __ Mov(z2, z31); __ Saddv(s2, p0, z2.VnS()); // destructive __ Uaddv(b4, p0, z31.VnB()); __ Mov(z5, z31); __ Uaddv(h5, p0, z5.VnH()); // destructive __ Uaddv(s6, p0, z31.VnS()); __ Mov(z7, z31); __ Uaddv(d7, p0, z7.VnD()); // destructive END(); if (CAN_RUN()) { RUN(); if (static_cast(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) { // Saddv ASSERT_EQUAL_64(0xfffffffffffffda9, d0); ASSERT_EQUAL_64(0xfffffffffffe9495, d1); ASSERT_EQUAL_64(0xffffffff07090b0c, d2); // Uaddv ASSERT_EQUAL_64(0x00000000000002a9, d4); ASSERT_EQUAL_64(0x0000000000019495, d5); ASSERT_EQUAL_64(0x0000000107090b0c, d6); ASSERT_EQUAL_64(0x8182838485868788, d7); } else { // Saddv ASSERT_EQUAL_64(0xfffffffffffffd62, d0); ASSERT_EQUAL_64(0xfffffffffffe8394, d1); ASSERT_EQUAL_64(0xfffffffed3e6fa0b, d2); // Uaddv ASSERT_EQUAL_64(0x0000000000000562, d4); ASSERT_EQUAL_64(0x0000000000028394, d5); ASSERT_EQUAL_64(0x00000001d3e6fa0b, d6); ASSERT_EQUAL_64(0x0a1c2e4052647687, d7); } // Check the upper lanes above the top of the V register are all clear. for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) { ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i); } } } TEST_SVE(sve_sminv_uminv) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00}; InsrHelper(&masm, z31.VnD(), in); // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 1, 0, 1 // For S lanes: 1, 1, 0, 0, 1 // For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1 int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1}; Initialise(&masm, p0.VnB(), pg_in); // Make a copy so we can check that constructive operations preserve zn. 
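  // For reference (illustration only), the D-lane expectations below can be
  // derived by hand. The D-lane predicate is, from highest to lowest lane,
  // {1, 0, 1}:
  //  - With more than 128 bits of VL, the active D lanes hold
  //    0xfffa5555aaaaaaaa (negative when signed) and 0x00112233aabbfc00, so
  //    Sminv gives 0xfffa5555aaaaaaaa and Uminv gives 0x00112233aabbfc00.
  //  - With exactly 128 bits of VL, only lane 0 (0x00112233aabbfc00) is
  //    active; the numerically smaller lane 1 is inactive, so both Sminv and
  //    Uminv return the lane 0 value.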
__ Mov(z0, z31); __ Sminv(b0, p0, z0.VnB()); // destructive __ Sminv(h1, p0, z31.VnH()); __ Mov(z2, z31); __ Sminv(s2, p0, z2.VnS()); // destructive __ Sminv(d3, p0, z31.VnD()); __ Uminv(b4, p0, z31.VnB()); __ Mov(z5, z31); __ Uminv(h5, p0, z5.VnH()); // destructive __ Uminv(s6, p0, z31.VnS()); __ Mov(z7, z31); __ Uminv(d7, p0, z7.VnD()); // destructive END(); if (CAN_RUN()) { RUN(); if (static_cast(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) { // Sminv ASSERT_EQUAL_64(0xaa, d0); ASSERT_EQUAL_64(0xaabb, d1); ASSERT_EQUAL_64(0xaabbfc00, d2); ASSERT_EQUAL_64(0x00112233aabbfc00, d3); // The smaller lane is inactive. // Uminv ASSERT_EQUAL_64(0, d4); ASSERT_EQUAL_64(0x2233, d5); ASSERT_EQUAL_64(0x112233, d6); ASSERT_EQUAL_64(0x00112233aabbfc00, d7); // The smaller lane is inactive. } else { // Sminv ASSERT_EQUAL_64(0xaa, d0); ASSERT_EQUAL_64(0xaaaa, d1); ASSERT_EQUAL_64(0xaaaaaaaa, d2); ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d3); // Uminv ASSERT_EQUAL_64(0, d4); ASSERT_EQUAL_64(0x2233, d5); ASSERT_EQUAL_64(0x112233, d6); ASSERT_EQUAL_64(0x00112233aabbfc00, d7); } // Check the upper lanes above the top of the V register are all clear. for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) { ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i); } } } TEST_SVE(sve_smaxv_umaxv) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00}; InsrHelper(&masm, z31.VnD(), in); // For simplicity, we re-use the same pg for various lane sizes. // For D lanes: 1, 0, 1 // For S lanes: 1, 1, 0, 0, 1 // For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1 int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1}; Initialise(&masm, p0.VnB(), pg_in); // Make a copy so we can check that constructive operations preserve zn. __ Mov(z0, z31); __ Smaxv(b0, p0, z0.VnB()); // destructive __ Smaxv(h1, p0, z31.VnH()); __ Mov(z2, z31); __ Smaxv(s2, p0, z2.VnS()); // destructive __ Smaxv(d3, p0, z31.VnD()); __ Umaxv(b4, p0, z31.VnB()); __ Mov(z5, z31); __ Umaxv(h5, p0, z5.VnH()); // destructive __ Umaxv(s6, p0, z31.VnS()); __ Mov(z7, z31); __ Umaxv(d7, p0, z7.VnD()); // destructive END(); if (CAN_RUN()) { RUN(); if (static_cast(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) { // Smaxv ASSERT_EQUAL_64(0x33, d0); ASSERT_EQUAL_64(0x44aa, d1); ASSERT_EQUAL_64(0x112233, d2); ASSERT_EQUAL_64(0x112233aabbfc00, d3); // Umaxv ASSERT_EQUAL_64(0xfe, d4); ASSERT_EQUAL_64(0xfc00, d5); ASSERT_EQUAL_64(0xaabbfc00, d6); ASSERT_EQUAL_64(0x112233aabbfc00, d7); } else { // Smaxv ASSERT_EQUAL_64(0x33, d0); ASSERT_EQUAL_64(0x44aa, d1); ASSERT_EQUAL_64(0x112233, d2); ASSERT_EQUAL_64(0x00112233aabbfc00, d3); // Umaxv ASSERT_EQUAL_64(0xfe, d4); ASSERT_EQUAL_64(0xfc00, d5); ASSERT_EQUAL_64(0xaabbfc00, d6); ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d7); } // Check the upper lanes above the top of the V register are all clear. 
for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) { ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i); ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i); } } } template static void SdotUdotHelper(Test* config, unsigned lane_size_in_bits, const T (&zd_inputs)[M], const T (&za_inputs)[M], const T (&zn_inputs)[N], const T (&zm_inputs)[N], const T (&zd_expected)[M], const T (&zdnm_expected)[M], bool is_signed, int index = -1) { VIXL_STATIC_ASSERT(N == (M * 4)); SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); auto dot_fn = [&](const ZRegister& zd, const ZRegister& za, const ZRegister& zn, const ZRegister& zm, bool is_signed_fn, int index_fn) { if (is_signed_fn) { if (index_fn < 0) { __ Sdot(zd, za, zn, zm); } else { __ Sdot(zd, za, zn, zm, index_fn); } } else { if (index_fn < 0) { __ Udot(zd, za, zn, zm); } else { __ Udot(zd, za, zn, zm, index_fn); } } }; ZRegister zd = z0.WithLaneSize(lane_size_in_bits); ZRegister za = z1.WithLaneSize(lane_size_in_bits); ZRegister zn = z2.WithLaneSize(lane_size_in_bits / 4); ZRegister zm = z3.WithLaneSize(lane_size_in_bits / 4); InsrHelper(&masm, zd, zd_inputs); InsrHelper(&masm, za, za_inputs); InsrHelper(&masm, zn, zn_inputs); InsrHelper(&masm, zm, zm_inputs); // The Dot macro handles arbitrarily-aliased registers in the argument list. ZRegister dm_result = z4.WithLaneSize(lane_size_in_bits); ZRegister dnm_result = z5.WithLaneSize(lane_size_in_bits); ZRegister da_result = z6.WithLaneSize(lane_size_in_bits); ZRegister dn_result = z7.WithLaneSize(lane_size_in_bits); ZRegister d_result = z8.WithLaneSize(lane_size_in_bits); __ Mov(da_result, za); // zda = zda + (zn . zm) dot_fn(da_result, da_result, zn, zm, is_signed, index); __ Mov(dn_result, zn.WithSameLaneSizeAs(dn_result)); // zdn = za + (zdn . zm) dot_fn(dn_result, za, dn_result.WithSameLaneSizeAs(zn), zm, is_signed, index); __ Mov(dm_result, zm.WithSameLaneSizeAs(dm_result)); // zdm = za + (zn . zdm) dot_fn(dm_result, za, zn, dm_result.WithSameLaneSizeAs(zm), is_signed, index); __ Mov(d_result, zd); // zd = za + (zn . zm) dot_fn(d_result, za, zn, zm, is_signed, index); __ Mov(dnm_result, zn.WithSameLaneSizeAs(dnm_result)); // zdnm = za + (zdmn . zdnm) dot_fn(dnm_result, za, dnm_result.WithSameLaneSizeAs(zn), dnm_result.WithSameLaneSizeAs(zm), is_signed, index); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits)); ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits / 4)); ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits / 4)); ASSERT_EQUAL_SVE(zd_expected, da_result); ASSERT_EQUAL_SVE(zd_expected, dn_result); ASSERT_EQUAL_SVE(zd_expected, dm_result); ASSERT_EQUAL_SVE(zd_expected, d_result); ASSERT_EQUAL_SVE(zdnm_expected, dnm_result); } } TEST_SVE(sve_sdot) { int64_t zd_inputs[] = {0x33, 0xee, 0xff}; int64_t za_inputs[] = {INT32_MAX, -3, 2}; int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8}; int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5}; // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[]) int64_t zd_expected_s[] = {-2147418113, -183, 133}; // 0x8000ffff int64_t zd_expected_d[] = {2147549183, -183, 133}; // 0x8000ffff // zdnm_expected[] = za_inputs[] + (zn_inputs[] . 
zn_inputs[]) int64_t zdnm_expected_s[] = {-2147418113, 980, 572}; int64_t zdnm_expected_d[] = {2147549183, 980, 572}; SdotUdotHelper(config, kSRegSize, zd_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_s, zdnm_expected_s, true); SdotUdotHelper(config, kDRegSize, zd_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_d, zdnm_expected_d, true); } TEST_SVE(sve_udot) { int64_t zd_inputs[] = {0x33, 0xee, 0xff}; int64_t za_inputs[] = {INT32_MAX, -3, 2}; int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8}; int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5}; // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[]) int64_t zd_expected_s[] = {0x8000ffff, 0x00001749, 0x0000f085}; int64_t zd_expected_d[] = {0x000000047c00ffff, 0x000000000017ff49, 0x00000000fff00085}; // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[]) int64_t zdnm_expected_s[] = {0x8000ffff, 0x000101d4, 0x0001d03c}; int64_t zdnm_expected_d[] = {0x000000047c00ffff, 0x00000000fffe03d4, 0x00000001ffce023c}; SdotUdotHelper(config, kSRegSize, zd_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_s, zdnm_expected_s, false); SdotUdotHelper(config, kDRegSize, zd_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_d, zdnm_expected_d, false); } TEST_SVE(sve_sdot_indexed_s) { int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff}; int64_t za_inputs[] = {0, 1, 2, 3}; int64_t zn_inputs[] = {-1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4}; int64_t zm_inputs[] = {127, 127, 127, 127, -128, -128, -128, -128, -1, -1, -1, -1, 0, 0, 0, 0}; constexpr int s = kQRegSize / kSRegSize; // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[]) int64_t zd_expected_s[][s] = {{0, 1, 2, 3}, // Generated from zm[0] {4, 9, 14, 19}, {512, 1025, 1538, 2051}, {-508, -1015, -1522, -2029}}; // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[]) int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67}, {12, 25, 38, 51}, {8, 17, 26, 35}, {4, 9, 14, 19}}; for (unsigned i = 0; i < s; i++) { SdotUdotHelper(config, kSRegSize, zd_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_s[i], zdnm_expected_s[i], true, i); } } TEST_SVE(sve_sdot_indexed_d) { int64_t zd_inputs[] = {0xff, 0xff}; int64_t za_inputs[] = {0, 1}; int64_t zn_inputs[] = {-1, -1, -1, -1, -1, -1, -1, -1}; int64_t zm_inputs[] = {-128, -128, -128, -128, 127, 127, 127, 127}; constexpr int d = kQRegSize / kDRegSize; // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[]) int64_t zd_expected_d[][d] = {{-508, -507}, // Generated from zm[0] {512, 513}}; // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[]) int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}}; for (unsigned i = 0; i < d; i++) { SdotUdotHelper(config, kDRegSize, zd_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_d[i], zdnm_expected_d[i], true, i); } } TEST_SVE(sve_udot_indexed_s) { int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff}; int64_t za_inputs[] = {0, 1, 2, 3}; int64_t zn_inputs[] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4}; int64_t zm_inputs[] = {127, 127, 127, 127, 255, 255, 255, 255, 1, 1, 1, 1, 0, 0, 0, 0}; constexpr int s = kQRegSize / kSRegSize; // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[]) int64_t zd_expected_s[][s] = {{0, 1, 2, 3}, {4, 9, 14, 19}, {1020, 2041, 3062, 4083}, {508, 1017, 1526, 2035}}; // zdnm_expected[] = za_inputs[] + (zn_inputs[] . 
zn_inputs[]) int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67}, {12, 25, 38, 51}, {8, 17, 26, 35}, {4, 9, 14, 19}}; for (unsigned i = 0; i < s; i++) { SdotUdotHelper(config, kSRegSize, zd_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_s[i], zdnm_expected_s[i], false, i); } } TEST_SVE(sve_udot_indexed_d) { int64_t zd_inputs[] = {0xff, 0xff}; int64_t za_inputs[] = {0, 1}; int64_t zn_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1}; int64_t zm_inputs[] = {255, 255, 255, 255, 127, 127, 127, 127}; constexpr int d = kQRegSize / kDRegSize; // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[]) int64_t zd_expected_d[][d] = {{508, 509}, {1020, 1021}}; // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[]) int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}}; for (unsigned i = 0; i < d; i++) { SdotUdotHelper(config, kDRegSize, zd_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_d[i], zdnm_expected_d[i], false, i); } } static void IntSegmentPatternHelper(MacroAssembler* masm, const ZRegister& dst, const ZRegister& src) { VIXL_ASSERT(AreSameLaneSize(dst, src)); UseScratchRegisterScope temps(masm); ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst); masm->Index(ztmp, 0, 1); masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2()); masm->Add(dst, src, ztmp); } TEST_SVE(sve_sdot_udot_indexed_s) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); const int multiplier = 2; __ Dup(z9.VnS(), multiplier); __ Ptrue(p0.VnB()); __ Index(z29.VnS(), 4, 1); // z29 = [... 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0] __ And(z29.VnS(), z29.VnS(), 3); // p7 = [... 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1] __ Cmple(p7.VnS(), p0.Zeroing(), z29.VnS(), 0); // p6 = [... 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] __ Cmple(p6.VnS(), p0.Zeroing(), z29.VnS(), 1); // p5 = [... 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1] __ Cmple(p5.VnS(), p0.Zeroing(), z29.VnS(), 2); __ Index(z28.VnB(), 1, 1); __ Dup(z27.VnS(), z28.VnS(), 0); // z27 = [... 3, 2, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1] IntSegmentPatternHelper(&masm, z27.VnB(), z27.VnB()); // z27 = [... 6, 4, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2] __ Mul(z27.VnS(), p7.Merging(), z27.VnS(), z9.VnS()); // z27 = [... 12, 8, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4] __ Mul(z27.VnS(), p6.Merging(), z27.VnS(), z9.VnS()); // 2nd segment | 1st segment | // v v // z27 = [... 24, 16, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4, 32, 24, 16, 8] __ Mul(z27.VnS(), p5.Merging(), z27.VnS(), z9.VnS()); __ Dup(z0.VnS(), 0); __ Dup(z1.VnS(), 0); __ Dup(z2.VnS(), 0); __ Dup(z3.VnS(), 0); __ Dup(z4.VnS(), 0); __ Dup(z5.VnS(), 0); // Skip the lanes starting from the 129th lane since the value of these lanes // are overflow after the number sequence creation by `index`. __ Cmpls(p3.VnB(), p0.Zeroing(), z28.VnB(), 128); __ Mov(z0.VnB(), p3.Merging(), z27.VnB()); __ Mov(z1.VnB(), p3.Merging(), z28.VnB()); __ Dup(z2.VnS(), 0); __ Dup(z3.VnS(), 0); __ Dup(z4.VnS(), 0); __ Dup(z5.VnS(), 0); __ Udot(z2.VnS(), z2.VnS(), z1.VnB(), z0.VnB(), 0); __ Udot(z3.VnS(), z3.VnS(), z1.VnB(), z0.VnB(), 1); __ Mul(z3.VnS(), z3.VnS(), 2); __ Udot(z4.VnS(), z4.VnS(), z1.VnB(), z0.VnB(), 2); __ Mul(z4.VnS(), z4.VnS(), 4); __ Udot(z5.VnS(), z5.VnS(), z1.VnB(), z0.VnB(), 3); __ Mul(z5.VnS(), z5.VnS(), 8); __ Dup(z7.VnS(), 0); __ Dup(z8.VnS(), 0); __ Dup(z9.VnS(), 0); __ Dup(z10.VnS(), 0); // Negate the all positive vector for testing signed dot. 
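  // (Illustration: negating the multiplicand means each signed dot product is
  // exactly the negation of the unsigned one computed above, e.g.
  //   s_lane[0] = (1 * -8) + (2 * -16) + (3 * -24) + (4 * -32) = -240,
  // so the sdot results below are checked against the negated udot values.)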
__ Neg(z6.VnB(), p0.Merging(), z0.VnB()); __ Sdot(z7.VnS(), z7.VnS(), z1.VnB(), z6.VnB(), 0); __ Sdot(z8.VnS(), z8.VnS(), z1.VnB(), z6.VnB(), 1); __ Mul(z8.VnS(), z8.VnS(), 2); __ Sdot(z9.VnS(), z9.VnS(), z1.VnB(), z6.VnB(), 2); __ Mul(z9.VnS(), z9.VnS(), 4); __ Sdot(z10.VnS(), z10.VnS(), z1.VnB(), z6.VnB(), 3); __ Mul(z10.VnS(), z10.VnS(), 8); END(); if (CAN_RUN()) { RUN(); // Only compare the first 128-bit segment of destination register, use // another result from generated instructions to check the remaining part. // s_lane[0] = (1 * 8) + (2 * 16) + (3 * 24) + (4 * 32) = 240 // ... // s_lane[3] = (13 * 8) + (14 * 16) + (15 * 24) + (16 * 32) = 1200 int udot_expected[] = {1200, 880, 560, 240}; ASSERT_EQUAL_SVE(udot_expected, z2.VnS()); ASSERT_EQUAL_SVE(z2.VnS(), z3.VnS()); ASSERT_EQUAL_SVE(z2.VnS(), z4.VnS()); ASSERT_EQUAL_SVE(z2.VnS(), z5.VnS()); int sdot_expected[] = {-1200, -880, -560, -240}; ASSERT_EQUAL_SVE(sdot_expected, z7.VnS()); ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS()); ASSERT_EQUAL_SVE(z7.VnS(), z9.VnS()); ASSERT_EQUAL_SVE(z7.VnS(), z10.VnS()); } } TEST_SVE(sve_sdot_udot_indexed_d) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); const int multiplier = 2; __ Dup(z9.VnD(), multiplier); __ Ptrue(p0.VnD()); __ Pfalse(p1.VnD()); // p2 = [..., 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] __ Zip1(p2.VnD(), p0.VnD(), p1.VnD()); __ Index(z1.VnH(), 1, 1); __ Dup(z0.VnD(), z1.VnD(), 0); // z0 = [... 5, 4, 3, 2, 5, 4, 3, 2, 4, 3, 2, 1, 4, 3, 2, 1] IntSegmentPatternHelper(&masm, z0.VnH(), z0.VnH()); // 2nd segment | 1st segment | // v v // z0 = [... 5, 4, 3, 2, 10, 8, 6, 4, 4, 3, 2, 1, 8, 6, 4, 2] __ Mul(z0.VnD(), p2.Merging(), z0.VnD(), z9.VnD()); __ Dup(z3.VnD(), 0); __ Dup(z4.VnD(), 0); __ Udot(z3.VnD(), z3.VnD(), z1.VnH(), z0.VnH(), 0); __ Udot(z4.VnD(), z4.VnD(), z1.VnH(), z0.VnH(), 1); __ Mul(z4.VnD(), z4.VnD(), multiplier); __ Dup(z12.VnD(), 0); __ Dup(z13.VnD(), 0); __ Ptrue(p4.VnH()); __ Neg(z10.VnH(), p4.Merging(), z0.VnH()); __ Sdot(z12.VnD(), z12.VnD(), z1.VnH(), z10.VnH(), 0); __ Sdot(z13.VnD(), z13.VnD(), z1.VnH(), z10.VnH(), 1); __ Mul(z13.VnD(), z13.VnD(), multiplier); END(); if (CAN_RUN()) { RUN(); // Only compare the first 128-bit segment of destination register, use // another result from generated instructions to check the remaining part. 
// d_lane[0] = (1 * 2) + (2 * 4) + (3 * 6) + (4 * 8) = 60 // d_lane[1] = (5 * 2) + (6 * 4) + (7 * 6) + (8 * 8) = 140 uint64_t udot_expected[] = {416, 304, 140, 60}; ASSERT_EQUAL_SVE(udot_expected, z3.VnD()); ASSERT_EQUAL_SVE(z3.VnD(), z4.VnD()); int64_t sdot_expected[] = {-416, -304, -140, -60}; ASSERT_EQUAL_SVE(sdot_expected, z12.VnD()); ASSERT_EQUAL_SVE(z12.VnD(), z13.VnD()); } } template static void FPToRawbitsWithSize(const T (&inputs)[N], uint64_t* outputs, unsigned size_in_bits) { for (size_t i = 0; i < N; i++) { outputs[i] = vixl::FPToRawbitsWithSize(size_in_bits, inputs[i]); } } template static void FPBinArithHelper(Test* config, ArithFn macro, int lane_size_in_bits, const Ti (&zn_inputs)[N], const Ti (&zm_inputs)[N], const Te (&zd_expected)[N]) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); ZRegister zd = z29.WithLaneSize(lane_size_in_bits); ZRegister zn = z30.WithLaneSize(lane_size_in_bits); ZRegister zm = z31.WithLaneSize(lane_size_in_bits); uint64_t zn_rawbits[N]; uint64_t zm_rawbits[N]; FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits); FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits); InsrHelper(&masm, zn, zn_rawbits); InsrHelper(&masm, zm, zm_rawbits); (masm.*macro)(zd, zn, zm); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zd_expected, zd); } } TEST_SVE(sve_fp_arithmetic_unpredicated_fadd) { double zn_inputs[] = {24.0, 5.5, 0.0, 3.875, 2.125, kFP64PositiveInfinity, kFP64NegativeInfinity}; double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0}; ArithFn fn = &MacroAssembler::Fadd; uint16_t expected_h[] = {Float16ToRawbits(Float16(1048.0)), Float16ToRawbits(Float16(2053.5)), Float16ToRawbits(Float16(0.1)), Float16ToRawbits(Float16(-0.875)), Float16ToRawbits(Float16(14.465)), Float16ToRawbits(kFP16PositiveInfinity), Float16ToRawbits(kFP16NegativeInfinity)}; FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h); uint32_t expected_s[] = {FloatToRawbits(1048.0f), FloatToRawbits(2053.5f), FloatToRawbits(0.1f), FloatToRawbits(-0.875f), FloatToRawbits(14.465f), FloatToRawbits(kFP32PositiveInfinity), FloatToRawbits(kFP32NegativeInfinity)}; FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s); uint64_t expected_d[] = {DoubleToRawbits(1048.0), DoubleToRawbits(2053.5), DoubleToRawbits(0.1), DoubleToRawbits(-0.875), DoubleToRawbits(14.465), DoubleToRawbits(kFP64PositiveInfinity), DoubleToRawbits(kFP64NegativeInfinity)}; FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d); } TEST_SVE(sve_fp_arithmetic_unpredicated_fsub) { double zn_inputs[] = {24.0, 5.5, 0.0, 3.875, 2.125, kFP64PositiveInfinity, kFP64NegativeInfinity}; double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0}; ArithFn fn = &MacroAssembler::Fsub; uint16_t expected_h[] = {Float16ToRawbits(Float16(-1000.0)), Float16ToRawbits(Float16(-2042.5)), Float16ToRawbits(Float16(-0.1)), Float16ToRawbits(Float16(8.625)), Float16ToRawbits(Float16(-10.215)), Float16ToRawbits(kFP16PositiveInfinity), Float16ToRawbits(kFP16NegativeInfinity)}; FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h); uint32_t expected_s[] = {FloatToRawbits(-1000.0), FloatToRawbits(-2042.5), FloatToRawbits(-0.1), FloatToRawbits(8.625), FloatToRawbits(-10.215), FloatToRawbits(kFP32PositiveInfinity), FloatToRawbits(kFP32NegativeInfinity)}; FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s); uint64_t expected_d[] = {DoubleToRawbits(-1000.0), DoubleToRawbits(-2042.5), DoubleToRawbits(-0.1), 
DoubleToRawbits(8.625), DoubleToRawbits(-10.215), DoubleToRawbits(kFP64PositiveInfinity), DoubleToRawbits(kFP64NegativeInfinity)}; FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d); } TEST_SVE(sve_fp_arithmetic_unpredicated_fmul) { double zn_inputs[] = {24.0, 5.5, 0.0, 3.875, 2.125, kFP64PositiveInfinity, kFP64NegativeInfinity}; double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0}; ArithFn fn = &MacroAssembler::Fmul; uint16_t expected_h[] = {Float16ToRawbits(Float16(24576.0)), Float16ToRawbits(Float16(11264.0)), Float16ToRawbits(Float16(0.0)), Float16ToRawbits(Float16(-18.4)), Float16ToRawbits(Float16(26.23)), Float16ToRawbits(kFP16PositiveInfinity), Float16ToRawbits(kFP16PositiveInfinity)}; FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h); uint32_t expected_s[] = {FloatToRawbits(24576.0), FloatToRawbits(11264.0), FloatToRawbits(0.0), FloatToRawbits(-18.40625), FloatToRawbits(26.2225), FloatToRawbits(kFP32PositiveInfinity), FloatToRawbits(kFP32PositiveInfinity)}; FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s); uint64_t expected_d[] = {DoubleToRawbits(24576.0), DoubleToRawbits(11264.0), DoubleToRawbits(0.0), DoubleToRawbits(-18.40625), DoubleToRawbits(26.2225), DoubleToRawbits(kFP64PositiveInfinity), DoubleToRawbits(kFP64PositiveInfinity)}; FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d); } typedef void (MacroAssembler::*FPArithPredicatedFn)( const ZRegister& zd, const PRegisterM& pg, const ZRegister& zn, const ZRegister& zm, FPMacroNaNPropagationOption nan_option); typedef void (MacroAssembler::*FPArithPredicatedNoNaNOptFn)( const ZRegister& zd, const PRegisterM& pg, const ZRegister& zn, const ZRegister& zm); template static void FPBinArithHelper( Test* config, FPArithPredicatedFn macro, FPArithPredicatedNoNaNOptFn macro_nonan, unsigned lane_size_in_bits, const Ti (&zd_inputs)[N], const int (&pg_inputs)[N], const Ti (&zn_inputs)[N], const Ti (&zm_inputs)[N], const Te (&zd_expected)[N], FPMacroNaNPropagationOption nan_option = FastNaNPropagation) { VIXL_ASSERT((macro == NULL) ^ (macro_nonan == NULL)); SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Avoid choosing default scratch registers. ZRegister zd = z26.WithLaneSize(lane_size_in_bits); ZRegister zn = z27.WithLaneSize(lane_size_in_bits); ZRegister zm = z28.WithLaneSize(lane_size_in_bits); uint64_t zn_inputs_rawbits[N]; uint64_t zm_inputs_rawbits[N]; uint64_t zd_inputs_rawbits[N]; FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits); FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits); FPToRawbitsWithSize(zd_inputs, zd_inputs_rawbits, lane_size_in_bits); InsrHelper(&masm, zn, zn_inputs_rawbits); InsrHelper(&masm, zm, zm_inputs_rawbits); InsrHelper(&masm, zd, zd_inputs_rawbits); PRegisterWithLaneSize pg = p0.WithLaneSize(lane_size_in_bits); Initialise(&masm, pg, pg_inputs); // `instr` zdn, pg, zdn, zm ZRegister dn_result = z0.WithLaneSize(lane_size_in_bits); __ Mov(dn_result, zn); if (macro_nonan == NULL) { (masm.*macro)(dn_result, pg.Merging(), dn_result, zm, nan_option); } else { (masm.*macro_nonan)(dn_result, pg.Merging(), dn_result, zm); } // Based on whether zd and zm registers are aliased, the macro of instructions // (`Instr`) swaps the order of operands if it has the commutative property, // otherwise, transfer to the reversed `Instr`, such as fdivr. 
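  // (For example: `Fdiv(z1, pg.Merging(), zn, z1)` cannot simply swap its
  // operands because division is not commutative, so the macro is expected to
  // emit the reversed form, `fdivr z1, pg, z1, zn`, instead.)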
// `instr` zdm, pg, zn, zdm ZRegister dm_result = z1.WithLaneSize(lane_size_in_bits); __ Mov(dm_result, zm); if (macro_nonan == NULL) { (masm.*macro)(dm_result, pg.Merging(), zn, dm_result, nan_option); } else { (masm.*macro_nonan)(dm_result, pg.Merging(), zn, dm_result); } // The macro of instructions (`Instr`) automatically selects between `instr` // and movprfx + `instr` based on whether zd and zn registers are aliased. // A generated movprfx instruction is predicated that using the same // governing predicate register. In order to keep the result constant, // initialize the destination register first. // `instr` zd, pg, zn, zm ZRegister d_result = z2.WithLaneSize(lane_size_in_bits); __ Mov(d_result, zd); if (macro_nonan == NULL) { (masm.*macro)(d_result, pg.Merging(), zn, zm, nan_option); } else { (masm.*macro_nonan)(d_result, pg.Merging(), zn, zm); } END(); if (CAN_RUN()) { RUN(); for (size_t i = 0; i < ArrayLength(zd_expected); i++) { int lane = static_cast(ArrayLength(zd_expected) - i - 1); if (!core.HasSVELane(dn_result, lane)) break; if ((pg_inputs[i] & 1) != 0) { ASSERT_EQUAL_SVE_LANE(zd_expected[i], dn_result, lane); } else { ASSERT_EQUAL_SVE_LANE(zn_inputs_rawbits[i], dn_result, lane); } } for (size_t i = 0; i < ArrayLength(zd_expected); i++) { int lane = static_cast(ArrayLength(zd_expected) - i - 1); if (!core.HasSVELane(dm_result, lane)) break; if ((pg_inputs[i] & 1) != 0) { ASSERT_EQUAL_SVE_LANE(zd_expected[i], dm_result, lane); } else { ASSERT_EQUAL_SVE_LANE(zm_inputs_rawbits[i], dm_result, lane); } } ASSERT_EQUAL_SVE(zd_expected, d_result); } } TEST_SVE(sve_binary_arithmetic_predicated_fdiv) { // The inputs are shared with different precision tests. double zd_in[] = {0.1, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9}; double zn_in[] = {24.0, 24.0, -2.0, -2.0, 5.5, 5.5, kFP64PositiveInfinity, kFP64PositiveInfinity, kFP64NegativeInfinity, kFP64NegativeInfinity}; double zm_in[] = {-2.0, -2.0, 24.0, 24.0, 0.5, 0.5, 0.65, 0.65, 24.0, 24.0}; int pg_in[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; uint16_t exp_h[] = {Float16ToRawbits(Float16(0.1)), Float16ToRawbits(Float16(-12.0)), Float16ToRawbits(Float16(2.2)), Float16ToRawbits(Float16(-0.0833)), Float16ToRawbits(Float16(4.4)), Float16ToRawbits(Float16(11.0)), Float16ToRawbits(Float16(6.6)), Float16ToRawbits(kFP16PositiveInfinity), Float16ToRawbits(Float16(8.8)), Float16ToRawbits(kFP16NegativeInfinity)}; FPBinArithHelper(config, NULL, &MacroAssembler::Fdiv, kHRegSize, zd_in, pg_in, zn_in, zm_in, exp_h); uint32_t exp_s[] = {FloatToRawbits(0.1), FloatToRawbits(-12.0), FloatToRawbits(2.2), 0xbdaaaaab, FloatToRawbits(4.4), FloatToRawbits(11.0), FloatToRawbits(6.6), FloatToRawbits(kFP32PositiveInfinity), FloatToRawbits(8.8), FloatToRawbits(kFP32NegativeInfinity)}; FPBinArithHelper(config, NULL, &MacroAssembler::Fdiv, kSRegSize, zd_in, pg_in, zn_in, zm_in, exp_s); uint64_t exp_d[] = {DoubleToRawbits(0.1), DoubleToRawbits(-12.0), DoubleToRawbits(2.2), 0xbfb5555555555555, DoubleToRawbits(4.4), DoubleToRawbits(11.0), DoubleToRawbits(6.6), DoubleToRawbits(kFP64PositiveInfinity), DoubleToRawbits(8.8), DoubleToRawbits(kFP64NegativeInfinity)}; FPBinArithHelper(config, NULL, &MacroAssembler::Fdiv, kDRegSize, zd_in, pg_in, zn_in, zm_in, exp_d); } TEST_SVE(sve_select) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in0[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0}; uint64_t in1[] = {0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa}; // For simplicity, we re-use the same pg for various lane sizes. 
// For D lanes: 1, 1, 0 // For S lanes: 1, 1, 1, 0, 0 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0}; Initialise(&masm, p0.VnB(), pg_in); PRegisterM pg = p0.Merging(); InsrHelper(&masm, z30.VnD(), in0); InsrHelper(&masm, z31.VnD(), in1); __ Sel(z0.VnB(), pg, z30.VnB(), z31.VnB()); __ Sel(z1.VnH(), pg, z30.VnH(), z31.VnH()); __ Sel(z2.VnS(), pg, z30.VnS(), z31.VnS()); __ Sel(z3.VnD(), pg, z30.VnD(), z31.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z0[] = {0xaaaaaaaa05aa07f8, 0xfeaaaaf0aac3870f, 0xaaaa56aa9abcdeaa}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); uint64_t expected_z1[] = {0xaaaaaaaaaaaa07f8, 0xaaaaf8f0e1c3870f, 0xaaaaaaaa9abcaaaa}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); uint64_t expected_z2[] = {0xaaaaaaaa05f607f8, 0xfefcf8f0e1c3870f, 0xaaaaaaaaaaaaaaaa}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0xaaaaaaaaaaaaaaaa}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); } } TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_h) { double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8}; double zn_inputs[] = {-2.1, 8.5, 225.5, 0.0, 8.8, -4.75, kFP64PositiveInfinity, kFP64NegativeInfinity}; double zm_inputs[] = {-2.0, -13.0, 24.0, 0.01, 0.5, 300.75, kFP64NegativeInfinity, kFP64PositiveInfinity}; int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1}; uint16_t zd_expected_max[] = {Float16ToRawbits(Float16(-2.0)), Float16ToRawbits(Float16(8.5)), Float16ToRawbits(Float16(3.3)), Float16ToRawbits(Float16(0.01)), Float16ToRawbits(Float16(5.5)), Float16ToRawbits(Float16(300.75)), Float16ToRawbits(kFP16PositiveInfinity), Float16ToRawbits(kFP16PositiveInfinity)}; FPBinArithHelper(config, &MacroAssembler::Fmax, NULL, kHRegSize, zd_inputs, pg_inputs, zn_inputs, zm_inputs, zd_expected_max); uint16_t zd_expected_min[] = {Float16ToRawbits(Float16(-2.1)), Float16ToRawbits(Float16(-13.0)), Float16ToRawbits(Float16(3.3)), Float16ToRawbits(Float16(0.0)), Float16ToRawbits(Float16(5.5)), Float16ToRawbits(Float16(-4.75)), Float16ToRawbits(kFP16NegativeInfinity), Float16ToRawbits(kFP16NegativeInfinity)}; FPBinArithHelper(config, &MacroAssembler::Fmin, NULL, kHRegSize, zd_inputs, pg_inputs, zn_inputs, zm_inputs, zd_expected_min); } TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_s) { double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8}; double zn_inputs[] = {-2.1, 8.5, 225.5, 0.0, 8.8, -4.75, kFP64PositiveInfinity, kFP64NegativeInfinity}; double zm_inputs[] = {-2.0, -13.0, 24.0, 0.01, 0.5, 300.75, kFP64NegativeInfinity, kFP64PositiveInfinity}; int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1}; uint32_t zd_expected_max[] = {FloatToRawbits(-2.0), FloatToRawbits(8.5), FloatToRawbits(3.3), FloatToRawbits(0.01), FloatToRawbits(5.5), FloatToRawbits(300.75), FloatToRawbits(kFP32PositiveInfinity), FloatToRawbits(kFP32PositiveInfinity)}; FPBinArithHelper(config, &MacroAssembler::Fmax, NULL, kSRegSize, zd_inputs, pg_inputs, zn_inputs, zm_inputs, zd_expected_max); uint32_t zd_expected_min[] = {FloatToRawbits(-2.1), FloatToRawbits(-13.0), FloatToRawbits(3.3), FloatToRawbits(0.0), FloatToRawbits(5.5), FloatToRawbits(-4.75), FloatToRawbits(kFP32NegativeInfinity), FloatToRawbits(kFP32NegativeInfinity)}; FPBinArithHelper(config, &MacroAssembler::Fmin, NULL, kSRegSize, zd_inputs, pg_inputs, zn_inputs, zm_inputs, zd_expected_min); } TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_d) { double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8}; double zn_inputs[] = {-2.1, 8.5, 
225.5, 0.0, 8.8, -4.75, kFP64PositiveInfinity, kFP64NegativeInfinity}; double zm_inputs[] = {-2.0, -13.0, 24.0, 0.01, 0.5, 300.75, kFP64NegativeInfinity, kFP64PositiveInfinity}; int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1}; uint64_t zd_expected_max[] = {DoubleToRawbits(-2.0), DoubleToRawbits(8.5), DoubleToRawbits(3.3), DoubleToRawbits(0.01), DoubleToRawbits(5.5), DoubleToRawbits(300.75), DoubleToRawbits(kFP64PositiveInfinity), DoubleToRawbits(kFP64PositiveInfinity)}; FPBinArithHelper(config, &MacroAssembler::Fmax, NULL, kDRegSize, zd_inputs, pg_inputs, zn_inputs, zm_inputs, zd_expected_max); uint64_t zd_expected_min[] = {DoubleToRawbits(-2.1), DoubleToRawbits(-13.0), DoubleToRawbits(3.3), DoubleToRawbits(0.0), DoubleToRawbits(5.5), DoubleToRawbits(-4.75), DoubleToRawbits(kFP64NegativeInfinity), DoubleToRawbits(kFP64NegativeInfinity)}; FPBinArithHelper(config, &MacroAssembler::Fmin, NULL, kDRegSize, zd_inputs, pg_inputs, zn_inputs, zm_inputs, zd_expected_min); } template static void BitwiseShiftImmHelper(Test* config, int lane_size_in_bits, const T (&zn_inputs)[N], int shift) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); ZRegister zd_asr = z25.WithLaneSize(lane_size_in_bits); ZRegister zd_lsr = z26.WithLaneSize(lane_size_in_bits); ZRegister zd_lsl = z27.WithLaneSize(lane_size_in_bits); ZRegister zn = z28.WithLaneSize(lane_size_in_bits); InsrHelper(&masm, zn, zn_inputs); __ Asr(zd_asr, zn, shift); __ Lsr(zd_lsr, zn, shift); __ Lsl(zd_lsl, zn, shift - 1); // Lsl supports 0 - lane_size-1. END(); if (CAN_RUN()) { RUN(); const uint64_t mask = GetUintMask(lane_size_in_bits); for (int i = 0; i < static_cast(N); i++) { int lane = N - i - 1; if (!core.HasSVELane(zd_asr, lane)) break; bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0; uint64_t result; if (shift >= lane_size_in_bits) { result = is_negative ? mask : 0; } else { result = zn_inputs[i] >> shift; if (is_negative) { result |= mask << (lane_size_in_bits - shift); result &= mask; } } ASSERT_EQUAL_SVE_LANE(result, zd_asr, lane); } for (int i = 0; i < static_cast(N); i++) { int lane = N - i - 1; if (!core.HasSVELane(zd_lsr, lane)) break; uint64_t result = (shift >= lane_size_in_bits) ? 0 : zn_inputs[i] >> shift; ASSERT_EQUAL_SVE_LANE(result, zd_lsr, lane); } for (int i = 0; i < static_cast(N); i++) { int lane = N - i - 1; if (!core.HasSVELane(zd_lsl, lane)) break; uint64_t result = (shift > lane_size_in_bits) ? 
0 : zn_inputs[i] << (shift - 1); ASSERT_EQUAL_SVE_LANE(result & mask, zd_lsl, lane); } } } TEST_SVE(sve_bitwise_shift_imm_unpredicated) { uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80}; int shift_b[] = {1, 3, 5, 8}; for (size_t i = 0; i < ArrayLength(shift_b); i++) { BitwiseShiftImmHelper(config, kBRegSize, inputs_b, shift_b[i]); } uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233}; int shift_h[] = {1, 8, 11, 16}; for (size_t i = 0; i < ArrayLength(shift_h); i++) { BitwiseShiftImmHelper(config, kHRegSize, inputs_h, shift_h[i]); } uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233}; int shift_s[] = {1, 9, 17, 32}; for (size_t i = 0; i < ArrayLength(shift_s); i++) { BitwiseShiftImmHelper(config, kSRegSize, inputs_s, shift_s[i]); } uint64_t inputs_d[] = {0xfedcba98fedcba98, 0xfffa5555aaaaaaaa, 0x0011223344aafe80}; int shift_d[] = {1, 23, 45, 64}; for (size_t i = 0; i < ArrayLength(shift_d); i++) { BitwiseShiftImmHelper(config, kDRegSize, inputs_d, shift_d[i]); } } template static void BitwiseShiftWideElementsHelper(Test* config, Shift shift_type, int lane_size_in_bits, const T (&zn_inputs)[N], const R& zm_inputs, const T (&zd_expected)[N]) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); ArithFn macro; // Since logical shift left and right by the current lane size width is equal // to 0, so initialize the array to 0 for convenience. uint64_t zd_expected_max_shift_amount[N] = {0}; switch (shift_type) { case ASR: { macro = &MacroAssembler::Asr; uint64_t mask = GetUintMask(lane_size_in_bits); for (size_t i = 0; i < ArrayLength(zn_inputs); i++) { bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0; zd_expected_max_shift_amount[i] = is_negative ? mask : 0; } break; } case LSR: macro = &MacroAssembler::Lsr; break; case LSL: macro = &MacroAssembler::Lsl; break; default: VIXL_UNIMPLEMENTED(); macro = NULL; break; } ZRegister zd = z26.WithLaneSize(lane_size_in_bits); ZRegister zn = z27.WithLaneSize(lane_size_in_bits); ZRegister zm = z28.WithLaneSize(kDRegSize); InsrHelper(&masm, zn, zn_inputs); InsrHelper(&masm, zm, zm_inputs); (masm.*macro)(zd, zn, zm); ZRegister zm_max_shift_amount = z25.WithLaneSize(kDRegSize); ZRegister zd_max_shift_amount = z24.WithLaneSize(lane_size_in_bits); __ Dup(zm_max_shift_amount, lane_size_in_bits); (masm.*macro)(zd_max_shift_amount, zn, zm_max_shift_amount); ZRegister zm_out_of_range = z23.WithLaneSize(kDRegSize); ZRegister zd_out_of_range = z22.WithLaneSize(lane_size_in_bits); __ Dup(zm_out_of_range, GetUintMask(lane_size_in_bits)); (masm.*macro)(zd_out_of_range, zn, zm_out_of_range); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zd_expected, zd); ASSERT_EQUAL_SVE(zd_expected_max_shift_amount, zd_max_shift_amount); ASSERT_EQUAL_SVE(zd_max_shift_amount, zd_out_of_range); } } TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_asr) { // clang-format off uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80, 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80}; int shift_b[] = {1, 3}; uint64_t expected_b[] = {0xff, 0xee, 0xdd, 0xcc, 0xff, 0x2a, 0xd5, 0xc0, 0xff, 0xfb, 0xf7, 0xf3, 0xff, 0x0a, 0xf5, 0xf0}; BitwiseShiftWideElementsHelper(config, ASR, kBRegSize, inputs_b, shift_b, expected_b); uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233, 0xfedc, 0xfa55, 0x0011, 0x2233, 0xfedc, 0xfa55, 0x0011, 0x2233}; int shift_h[] = {1, 8, 11}; uint64_t expected_h[] = {0xff6e, 0xfd2a, 0x0008, 0x1119, 0xfffe, 0xfffa, 0x0000, 0x0022, 0xffff, 0xffff, 0x0000, 0x0004}; BitwiseShiftWideElementsHelper(config, ASR, 
kHRegSize, inputs_h, shift_h, expected_h); uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888}; int shift_s[] = {1, 9, 23}; uint64_t expected_s[] = {0xff6e5d4c, 0xfffd2ad5, 0x00000891, 0x000091a2, 0xffffff55, 0xffffff11}; BitwiseShiftWideElementsHelper(config, ASR, kSRegSize, inputs_s, shift_s, expected_s); // clang-format on } TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsr) { // clang-format off uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80, 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80}; int shift_b[] = {1, 3}; uint64_t expected_b[] = {0x7f, 0x6e, 0x5d, 0x4c, 0x7f, 0x2a, 0x55, 0x40, 0x1f, 0x1b, 0x17, 0x13, 0x1f, 0x0a, 0x15, 0x10}; BitwiseShiftWideElementsHelper(config, LSR, kBRegSize, inputs_b, shift_b, expected_b); uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233, 0xfedc, 0xfa55, 0x0011, 0x2233, 0xfedc, 0xfa55, 0x0011, 0x2233}; int shift_h[] = {1, 8, 11}; uint64_t expected_h[] = {0x7f6e, 0x7d2a, 0x0008, 0x1119, 0x00fe, 0x00fa, 0x0000, 0x0022, 0x001f, 0x001f, 0x0000, 0x0004}; BitwiseShiftWideElementsHelper(config, LSR, kHRegSize, inputs_h, shift_h, expected_h); uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888}; int shift_s[] = {1, 9, 23}; uint64_t expected_s[] = {0x7f6e5d4c, 0x7ffd2ad5, 0x00000891, 0x000091a2, 0x00000155, 0x00000111}; BitwiseShiftWideElementsHelper(config, LSR, kSRegSize, inputs_s, shift_s, expected_s); // clang-format on } TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsl) { // clang-format off uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80, 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80}; int shift_b[] = {1, 5}; uint64_t expected_b[] = {0xfc, 0xb8, 0x74, 0x30, 0xfe, 0xaa, 0x54, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xe0, 0xa0, 0x40, 0x00}; BitwiseShiftWideElementsHelper(config, LSL, kBRegSize, inputs_b, shift_b, expected_b); uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233, 0xfedc, 0xfa55, 0x0011, 0x2233, 0xfedc, 0xfa55, 0x0011, 0x2233}; int shift_h[] = {1, 2, 14}; uint64_t expected_h[] = {0xfdb8, 0xf4aa, 0x0022, 0x4466, 0xfb70, 0xe954, 0x0044, 0x88cc, 0x0000, 0x4000, 0x4000, 0xc000}; BitwiseShiftWideElementsHelper(config, LSL, kHRegSize, inputs_h, shift_h, expected_h); uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888}; int shift_s[] = {1, 19, 26}; uint64_t expected_s[] = {0xfdb97530, 0xfff4ab54, 0x11980000, 0x2b380000, 0xa8000000, 0x20000000}; BitwiseShiftWideElementsHelper(config, LSL, kSRegSize, inputs_s, shift_s, expected_s); // Test large shifts outside the range of the "unsigned" type. 
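  // (Illustration: the wide shift amount is taken from a 64-bit D element and
  // is not truncated to the lane width, so 0x1000000001 does not wrap to 1.
  // Any amount greater than or equal to the lane size shifts out every bit,
  // which is why the eight B lanes governed by that D element are expected to
  // be zero below, while the other eight lanes see a plain shift by 1.)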
uint64_t inputs_b2[] = {1, 2, 4, 8, 3, 5, 7, 9, 1, 2, 4, 8, 3, 5, 7, 9}; uint64_t shift_b2[] = {1, 0x1000000001}; uint64_t expected_b2[] = {2, 4, 8, 16, 6, 10, 14, 18, 0, 0, 0, 0, 0, 0, 0, 0}; BitwiseShiftWideElementsHelper(config, LSL, kBRegSize, inputs_b2, shift_b2, expected_b2); // clang-format on } TEST_SVE(sve_shift_by_vector) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); __ Pfalse(p1.VnB()); __ Zip1(p2.VnB(), p0.VnB(), p1.VnB()); __ Zip1(p3.VnH(), p0.VnH(), p1.VnH()); __ Zip1(p4.VnS(), p0.VnS(), p1.VnS()); __ Zip1(p5.VnD(), p0.VnD(), p1.VnD()); __ Dup(z31.VnD(), 0x8000000080008080); __ Dup(z0.VnB(), -1); __ Index(z1.VnB(), 0, 1); __ Dup(z2.VnB(), 0x55); __ Lsr(z2.VnB(), p2.Merging(), z0.VnB(), z1.VnB()); __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnB()); __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnB()); __ Index(z1.VnH(), 0, 1); __ Dup(z6.VnB(), 0x55); __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnH()); __ Lsl(z6.VnH(), p3.Merging(), z0.VnH(), z1.VnH()); __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnH()); __ Index(z1.VnS(), 0, 1); __ Dup(z10.VnB(), 0x55); __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnS()); __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnS()); __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnS()); __ Index(z1.VnD(), 0, 1); __ Lsr(z0.VnD(), p5.Merging(), z0.VnD(), z1.VnD()); __ Lsl(z12.VnD(), p0.Merging(), z0.VnD(), z1.VnD()); __ Asr(z13.VnD(), p0.Merging(), z31.VnD(), z1.VnD()); __ Dup(z11.VnD(), 0x100000001); __ Lsl(z14.VnD(), p0.Merging(), z1.VnD(), z11.VnD()); __ Index(z0.VnH(), 7, -1); __ Lsr(z0.VnH(), p0.Merging(), z31.VnH(), z0.VnH()); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z0[] = {0x8000000020001010, 0x0800000002000101}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); uint64_t expected_z2[] = {0x5500550055005500, 0x5503550f553f55ff}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0x0000000000000000, 0x80c0e0f0f8fcfeff}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0xff000000ff00ffff, 0xff000000f000c080}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x01ff03ff07ff0fff, 0x1fff3fff7fffffff}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x5555ffc05555fff0, 0x5555fffc5555ffff}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0xff000000fc00f808, 0xf0000000c0008080}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0x1fffffff3fffffff, 0x7fffffffffffffff}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = {0xfffffff8fffffffc, 0xfffffffeffffffff}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z10[] = {0x55555555e0002020, 0x5555555580008080}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); uint64_t expected_z12[] = {0xfffffffffffffffe, 0xffffffffffffffff}; ASSERT_EQUAL_SVE(expected_z12, z12.VnD()); uint64_t expected_z13[] = {0xc000000040004040, 0x8000000080008080}; ASSERT_EQUAL_SVE(expected_z13, z13.VnD()); uint64_t expected_z14[] = {0, 0}; ASSERT_EQUAL_SVE(expected_z14, z14.VnD()); } } TEST_SVE(sve_shift_by_wide_vector) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); __ Pfalse(p1.VnB()); __ Zip1(p2.VnB(), p0.VnB(), p1.VnB()); __ Zip1(p3.VnH(), p0.VnH(), p1.VnH()); __ Zip1(p4.VnS(), p0.VnS(), p1.VnS()); __ Dup(z31.VnD(), 0x8000000080008080); __ Dup(z0.VnB(), -1); __ Index(z1.VnD(), 1, 5); __ Dup(z2.VnB(), 0x55); __ Lsr(z2.VnB(), p2.Merging(), z2.VnB(), z1.VnD()); __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnD()); __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnD()); 
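  // (Worked example, for reference only: z1.VnD() = {..., 11, 6, 1}, so the
  // B lanes of the first 128-bit segment are shifted right arithmetically by
  // 1 and by 6. With z31 = 0x8000000080008080 per D lane, 0x80 >> 1 = 0xc0
  // and 0x80 >> 6 = 0xfe, giving the expected z4 values 0xc0000000c000c0c0
  // and 0xfe000000fe00fefe checked below.)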
__ Dup(z6.VnB(), 0x55); __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnD()); __ Lsl(z6.VnH(), p3.Merging(), z6.VnH(), z1.VnD()); __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnD()); __ Dup(z10.VnB(), 0x55); __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnD()); __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnD()); __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z2[] = {0x5501550155015501, 0x552a552a552a552a}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0xc0c0c0c0c0c0c0c0, 0xfefefefefefefefe}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0xfe000000fe00fefe, 0xc0000000c000c0c0}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x03ff03ff03ff03ff, 0x7fff7fff7fff7fff}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x5555554055555540, 0x5555aaaa5555aaaa}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0xfe000000fe00fe02, 0xc0000000c000c040}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0x03ffffff03ffffff, 0x7fffffff7fffffff}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = {0xffffffc0ffffffc0, 0xfffffffefffffffe}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z10[] = {0x55555555fe000202, 0x55555555c0004040}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); } } TEST_SVE(sve_pred_shift_imm) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); __ Pfalse(p1.VnB()); __ Zip1(p2.VnB(), p0.VnB(), p1.VnB()); __ Zip1(p3.VnH(), p0.VnH(), p1.VnH()); __ Zip1(p4.VnS(), p0.VnS(), p1.VnS()); __ Zip1(p5.VnD(), p0.VnD(), p1.VnD()); __ Dup(z31.VnD(), 0x8000000080008080); __ Lsr(z0.VnB(), p0.Merging(), z31.VnB(), 1); __ Mov(z1, z0); __ Lsl(z1.VnB(), p2.Merging(), z1.VnB(), 1); __ Asr(z2.VnB(), p0.Merging(), z1.VnB(), 2); __ Lsr(z3.VnH(), p0.Merging(), z31.VnH(), 2); __ Mov(z4, z3); __ Lsl(z4.VnH(), p3.Merging(), z4.VnH(), 2); __ Asr(z5.VnH(), p0.Merging(), z4.VnH(), 3); __ Lsr(z6.VnS(), p0.Merging(), z31.VnS(), 3); __ Mov(z7, z6); __ Lsl(z7.VnS(), p4.Merging(), z7.VnS(), 3); __ Asr(z8.VnS(), p0.Merging(), z7.VnS(), 4); __ Lsr(z9.VnD(), p0.Merging(), z31.VnD(), 4); __ Mov(z10, z9); __ Lsl(z10.VnD(), p5.Merging(), z10.VnD(), 4); __ Asr(z11.VnD(), p0.Merging(), z10.VnD(), 5); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z0[] = {0x4000000040004040, 0x4000000040004040}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); uint64_t expected_z1[] = {0x4000000040004080, 0x4000000040004080}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); uint64_t expected_z2[] = {0x10000000100010e0, 0x10000000100010e0}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0x2000000020002020, 0x2000000020002020}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0x2000000020008080, 0x2000000020008080}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x040000000400f010, 0x040000000400f010}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x1000000010001010, 0x1000000010001010}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0x1000000080008080, 0x1000000080008080}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0x01000000f8000808, 0x01000000f8000808}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = {0x0800000008000808, 0x0800000008000808}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z10[] = {0x0800000008000808, 0x8000000080008080}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); uint64_t expected_z11[] = 
{0x0040000000400040, 0xfc00000004000404}; ASSERT_EQUAL_SVE(expected_z11, z11.VnD()); } } TEST_SVE(sve_asrd) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); __ Pfalse(p1.VnB()); __ Zip1(p2.VnB(), p0.VnB(), p1.VnB()); __ Zip1(p3.VnH(), p0.VnH(), p1.VnH()); __ Zip1(p4.VnS(), p0.VnS(), p1.VnS()); __ Zip1(p5.VnD(), p0.VnD(), p1.VnD()); __ Index(z31.VnB(), 0x7f - 3, 1); __ Asrd(z0.VnB(), p0.Merging(), z31.VnB(), 1); __ Mov(z1, z31); __ Asrd(z1.VnB(), p2.Merging(), z1.VnB(), 2); __ Asrd(z2.VnB(), p0.Merging(), z31.VnB(), 7); __ Asrd(z3.VnB(), p0.Merging(), z31.VnB(), 8); __ Index(z31.VnH(), 0x7fff - 3, 1); __ Asrd(z4.VnH(), p0.Merging(), z31.VnH(), 1); __ Mov(z5, z31); __ Asrd(z5.VnH(), p3.Merging(), z5.VnH(), 2); __ Asrd(z6.VnH(), p0.Merging(), z31.VnH(), 15); __ Asrd(z7.VnH(), p0.Merging(), z31.VnH(), 16); __ Index(z31.VnS(), 0x7fffffff - 1, 1); __ Asrd(z8.VnS(), p0.Merging(), z31.VnS(), 1); __ Mov(z9, z31); __ Asrd(z9.VnS(), p4.Merging(), z9.VnS(), 2); __ Asrd(z10.VnS(), p0.Merging(), z31.VnS(), 31); __ Asrd(z11.VnS(), p0.Merging(), z31.VnS(), 32); __ Index(z31.VnD(), 0x7fffffffffffffff, 1); __ Asrd(z12.VnD(), p0.Merging(), z31.VnD(), 1); __ Mov(z13, z31); __ Asrd(z13.VnD(), p5.Merging(), z13.VnD(), 2); __ Asrd(z14.VnD(), p0.Merging(), z31.VnD(), 63); __ Asrd(z31.VnD(), p0.Merging(), z31.VnD(), 64); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z0[] = {0xc6c5c5c4c4c3c3c2, 0xc2c1c1c03f3f3e3e}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); uint64_t expected_z1[] = {0x8be389e287e285e1, 0x83e181e07f1f7d1f}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); uint64_t expected_z2[] = {0x0000000000000000, 0x000000ff00000000}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0x0000000000000000, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0xc002c001c001c000, 0x3fff3fff3ffe3ffe}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x8003e0018001e000, 0x7fff1fff7ffd1fff}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x000000000000ffff, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0x0000000000000000, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0xc0000001c0000000, 0x3fffffff3fffffff}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = {0x80000001e0000000, 0x7fffffff1fffffff}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z10[] = {0x00000000ffffffff, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); uint64_t expected_z11[] = {0x0000000000000000, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z11, z11.VnD()); uint64_t expected_z12[] = {0xc000000000000000, 0x3fffffffffffffff}; ASSERT_EQUAL_SVE(expected_z12, z12.VnD()); uint64_t expected_z13[] = {0x8000000000000000, 0x1fffffffffffffff}; ASSERT_EQUAL_SVE(expected_z13, z13.VnD()); uint64_t expected_z14[] = {0xffffffffffffffff, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z14, z14.VnD()); uint64_t expected_z31[] = {0x0000000000000000, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z31, z31.VnD()); } } TEST_SVE(sve_setffr) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p15.VnB()); __ Setffr(); __ Rdffr(p14.VnB()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(p14.VnB(), p15.VnB()); } } static void WrffrHelper(Test* config, unsigned active_lanes) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int inputs[kPRegMaxSize] = {0}; VIXL_ASSERT(active_lanes <= kPRegMaxSize); for (unsigned i = 0; i < 
active_lanes; i++) { // The rightmost (highest-indexed) array element maps to the lowest-numbered // lane. inputs[kPRegMaxSize - i - 1] = 1; } Initialise(&masm, p1.VnB(), inputs); __ Wrffr(p1.VnB()); __ Rdffr(p2.VnB()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(p1.VnB(), p2.VnB()); } } TEST_SVE(sve_wrffr) { int active_lanes_inputs[] = {0, 1, 7, 10, 32, 48, kPRegMaxSize}; for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) { WrffrHelper(config, active_lanes_inputs[i]); } } template static void RdffrHelper(Test* config, size_t active_lanes, const int (&pg_inputs)[N]) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); VIXL_ASSERT(active_lanes <= kPRegMaxSize); // The rightmost (highest-indexed) array element maps to the lowest-numbered // lane. int pd[kPRegMaxSize] = {0}; for (unsigned i = 0; i < active_lanes; i++) { pd[kPRegMaxSize - i - 1] = 1; } int pg[kPRegMaxSize] = {0}; for (unsigned i = 0; i < N; i++) { pg[kPRegMaxSize - i - 1] = pg_inputs[i]; } int pd_expected[kPRegMaxSize] = {0}; for (unsigned i = 0; i < std::min(active_lanes, N); i++) { int lane = kPRegMaxSize - i - 1; pd_expected[lane] = pd[lane] & pg[lane]; } Initialise(&masm, p0.VnB(), pg); Initialise(&masm, p1.VnB(), pd); // The unpredicated form of rdffr has been tested in `WrffrHelper`. __ Wrffr(p1.VnB()); __ Rdffr(p14.VnB(), p0.Zeroing()); __ Rdffrs(p13.VnB(), p0.Zeroing()); __ Mrs(x8, NZCV); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(pd_expected, p14.VnB()); ASSERT_EQUAL_SVE(pd_expected, p13.VnB()); StatusFlags nzcv_expected = GetPredTestFlags(pd_expected, pg, core.GetSVELaneCount(kBRegSize)); ASSERT_EQUAL_64(nzcv_expected, x8); } } TEST_SVE(sve_rdffr_rdffrs) { // clang-format off int active_lanes_inputs[] = {0, 1, 15, 26, 39, 47, kPRegMaxSize}; int pg_inputs_0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int pg_inputs_1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0}; int pg_inputs_2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}; int pg_inputs_3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1}; int pg_inputs_4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // clang-format on for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) { RdffrHelper(config, active_lanes_inputs[i], pg_inputs_0); RdffrHelper(config, active_lanes_inputs[i], pg_inputs_1); RdffrHelper(config, active_lanes_inputs[i], pg_inputs_2); RdffrHelper(config, active_lanes_inputs[i], pg_inputs_3); RdffrHelper(config, active_lanes_inputs[i], pg_inputs_4); } } typedef void (MacroAssembler::*BrkpFn)(const PRegisterWithLaneSize& pd, const PRegisterZ& pg, const PRegisterWithLaneSize& pn, const PRegisterWithLaneSize& pm); template static void BrkpaBrkpbHelper(Test* config, BrkpFn macro, BrkpFn macro_set_flags, const Tg& pg_inputs, const Tn& pn_inputs, const Tn& pm_inputs, const Td& pd_expected) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); PRegister pg = p15; PRegister pn = p14; PRegister pm = p13; Initialise(&masm, pg.VnB(), pg_inputs); Initialise(&masm, pn.VnB(), pn_inputs); Initialise(&masm, pm.VnB(), pm_inputs); // Initialise NZCV to an impossible value, to check that we actually write it. __ Mov(x10, NZCVFlag); __ Msr(NZCV, x10); (masm.*macro_set_flags)(p0.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB()); __ Mrs(x0, NZCV); (masm.*macro)(p1.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(pd_expected, p0.VnB()); // Check that the flags were properly set. 
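    // (As a reminder of the predicate-test flag encoding assumed by
    // GetPredTestFlags: N is set if the first active lane of the result is
    // true, Z is set if no active lane is true, C is set if the last active
    // lane is not true, and V is always cleared.)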
StatusFlags nzcv_expected = GetPredTestFlags(pd_expected, pg_inputs, core.GetSVELaneCount(kBRegSize)); ASSERT_EQUAL_64(nzcv_expected, x0); ASSERT_EQUAL_SVE(p0.VnB(), p1.VnB()); } } template static void BrkpaHelper(Test* config, const Tg& pg_inputs, const Tn& pn_inputs, const Tn& pm_inputs, const Td& pd_expected) { BrkpaBrkpbHelper(config, &MacroAssembler::Brkpa, &MacroAssembler::Brkpas, pg_inputs, pn_inputs, pm_inputs, pd_expected); } template static void BrkpbHelper(Test* config, const Tg& pg_inputs, const Tn& pn_inputs, const Tn& pm_inputs, const Td& pd_expected) { BrkpaBrkpbHelper(config, &MacroAssembler::Brkpb, &MacroAssembler::Brkpbs, pg_inputs, pn_inputs, pm_inputs, pd_expected); } TEST_SVE(sve_brkpb) { // clang-format off // The last active element of `pn` are `true` in all vector length configurations. // | boundary of 128-bits VL. // v int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}; int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}; int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; // | highest-numbered lane lowest-numbered lane | // v v int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}; int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0}; int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1}; int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}; int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}; // | first active // v int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}; // | first active // v int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}; // | first active // v int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}; BrkpbHelper(config, pg_1, pn_1, pm_1, exp_1_1_1); BrkpbHelper(config, pg_1, pn_2, pm_2, exp_1_2_2); BrkpbHelper(config, pg_1, pn_3, pm_3, exp_1_3_3); // | first active // v int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}; // | first active // v int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}; // | first active // v int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1}; BrkpbHelper(config, pg_2, pn_1, pm_2, exp_2_1_2); BrkpbHelper(config, pg_2, pn_2, pm_3, exp_2_2_3); BrkpbHelper(config, pg_2, pn_3, pm_1, exp_2_3_1); // | first active // v int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1}; // | first active // v int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1}; // | first active // v int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; BrkpbHelper(config, pg_3, pn_1, pm_3, exp_3_1_3); BrkpbHelper(config, pg_3, pn_2, pm_1, exp_3_2_1); BrkpbHelper(config, pg_3, pn_3, pm_2, exp_3_3_2); // The last active element of `pn` are `false` in all vector length configurations. // | last active lane when VL > 128 bits. // v // | last active lane when VL == 128 bits. 
// v int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; BrkpbHelper(config, pg_4, pn_1, pm_1, exp_4_x_x); BrkpbHelper(config, pg_4, pn_2, pm_2, exp_4_x_x); BrkpbHelper(config, pg_4, pn_3, pm_3, exp_4_x_x); // clang-format on } TEST_SVE(sve_brkpa) { // clang-format off // The last active element of `pn` are `true` in all vector length configurations. // | boundary of 128-bits VL. // v int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}; int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}; int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; // | highest-numbered lane lowest-numbered lane | // v v int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}; int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0}; int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1}; int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}; int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}; // | first active // v int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0}; // | first active // v int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}; // | first active // v int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0}; BrkpaHelper(config, pg_1, pn_1, pm_1, exp_1_1_1); BrkpaHelper(config, pg_1, pn_2, pm_2, exp_1_2_2); BrkpaHelper(config, pg_1, pn_3, pm_3, exp_1_3_3); // | first active // v int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}; // | first active // v int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}; // | first active // v int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1}; BrkpaHelper(config, pg_2, pn_1, pm_2, exp_2_1_2); BrkpaHelper(config, pg_2, pn_2, pm_3, exp_2_2_3); BrkpaHelper(config, pg_2, pn_3, pm_1, exp_2_3_1); // | first active // v int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; // | first active // v int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; // | first active // v int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; BrkpaHelper(config, pg_3, pn_1, pm_3, exp_3_1_3); BrkpaHelper(config, pg_3, pn_2, pm_1, exp_3_2_1); BrkpaHelper(config, pg_3, pn_3, pm_2, exp_3_3_2); // The last active element of `pn` are `false` in all vector length configurations. // | last active lane when VL > 128 bits. // v // | last active lane when VL == 128 bits. 
// v int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; BrkpaHelper(config, pg_4, pn_1, pm_1, exp_4_x_x); BrkpaHelper(config, pg_4, pn_2, pm_2, exp_4_x_x); BrkpaHelper(config, pg_4, pn_3, pm_3, exp_4_x_x); // clang-format on } TEST_SVE(sve_rbit) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55}; InsrHelper(&masm, z0.VnD(), inputs); __ Ptrue(p1.VnB()); int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1}; Initialise(&masm, p2.VnB(), pred); __ Rbit(z0.VnB(), p1.Merging(), z0.VnB()); __ Rbit(z0.VnB(), p1.Merging(), z0.VnB()); __ Rbit(z1.VnB(), p1.Merging(), z0.VnB()); __ Rbit(z2.VnH(), p1.Merging(), z0.VnH()); __ Rbit(z3.VnS(), p1.Merging(), z0.VnS()); __ Rbit(z4.VnD(), p1.Merging(), z0.VnD()); __ Dup(z5.VnB(), 0x42); __ Rbit(z5.VnB(), p2.Merging(), z0.VnB()); __ Dup(z6.VnB(), 0x42); __ Rbit(z6.VnS(), p2.Merging(), z0.VnS()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(inputs, z0.VnD()); uint64_t expected_z1[] = {0x55555555aaaaaaaa, 0x5555aaaa55aa55aa}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); uint64_t expected_z2[] = {0x55555555aaaaaaaa, 0x5555aaaaaa55aa55}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0xaaaa5555aa55aa55}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0xaa55aa55aaaa5555}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x4255425542aa42aa, 0x4255424242aa42aa}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0x42424242aa55aa55}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); } } TEST_SVE(sve_rev_bhw) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55}; InsrHelper(&masm, z0.VnD(), inputs); __ Ptrue(p1.VnB()); int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1}; Initialise(&masm, p2.VnB(), pred); __ Revb(z1.VnH(), p1.Merging(), z0.VnH()); __ Revb(z2.VnS(), p1.Merging(), z0.VnS()); __ Revb(z3.VnD(), p1.Merging(), z0.VnD()); __ Revh(z4.VnS(), p1.Merging(), z0.VnS()); __ Revh(z5.VnD(), p1.Merging(), z0.VnD()); __ Revw(z6.VnD(), p1.Merging(), z0.VnD()); __ Dup(z7.VnB(), 0x42); __ Revb(z7.VnH(), p2.Merging(), z0.VnH()); __ Dup(z8.VnB(), 0x42); __ Revh(z8.VnS(), p2.Merging(), z0.VnS()); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z1[] = {0xaaaaaaaa55555555, 0xaaaa555555aa55aa}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); uint64_t expected_z2[] = {0xaaaaaaaa55555555, 0x5555aaaa55aa55aa}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0x55aa55aa5555aaaa}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0x5555aaaaaa55aa55}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x55555555aaaaaaaa, 0xaa55aa555555aaaa}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0xaa55aa55aaaa5555}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0xaaaaaaaa55555555, 0xaaaa424255aa55aa}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0xaaaaaaaa55555555, 0x42424242aa55aa55}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); } } TEST_SVE(sve_ftssel) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in[] = {0x1111777766665555, 0xaaaabbbbccccdddd}; uint64_t q[] = {0x0001000300000002, 0x0001000200000003}; InsrHelper(&masm, z0.VnD(), in); 
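  // A hedged summary of the `Ftssel` results checked below (inferred from the
  // expected values in this test rather than quoted from a reference): the low
  // two bits of each element of the second source act as a quadrant selector.
  // Bit 0 chooses between the first source element and 1.0 of the lane type,
  // and bit 1 then toggles the sign bit of that choice. For example, in H
  // lanes:
  //   in = 0xaaaa, q = 0x0001  ->  0x3c00 (1.0)
  //   in = 0xbbbb, q = 0x0002  ->  0x3bbb (the input with its sign bit flipped)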
InsrHelper(&masm, z1.VnD(), q); __ Ftssel(z2.VnH(), z0.VnH(), z1.VnH()); __ Ftssel(z3.VnS(), z0.VnS(), z1.VnS()); __ Ftssel(z4.VnD(), z0.VnD(), z1.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z2[] = {0x3c00bc006666d555, 0x3c003bbbccccbc00}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0xbf800000e6665555, 0x2aaabbbbbf800000}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0x9111777766665555, 0xbff0000000000000}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); } } TEST_SVE(sve_fexpa) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in0[] = {0x3ff0000000000000, 0x3ff0000000011001}; uint64_t in1[] = {0x3ff000000002200f, 0xbff000000003301f}; uint64_t in2[] = {0xbff000000004403f, 0x3ff0000000055040}; uint64_t in3[] = {0x3f800000bf800001, 0x3f80000f3f80001f}; uint64_t in4[] = {0x3f80002f3f82203f, 0xbf8000403f833041}; uint64_t in5[] = {0x3c003c01bc00bc07, 0x3c08bc0f3c1fbc20}; InsrHelper(&masm, z0.VnD(), in0); InsrHelper(&masm, z1.VnD(), in1); InsrHelper(&masm, z2.VnD(), in2); InsrHelper(&masm, z3.VnD(), in3); InsrHelper(&masm, z4.VnD(), in4); InsrHelper(&masm, z5.VnD(), in5); __ Fexpa(z6.VnD(), z0.VnD()); __ Fexpa(z7.VnD(), z1.VnD()); __ Fexpa(z8.VnD(), z2.VnD()); __ Fexpa(z9.VnS(), z3.VnS()); __ Fexpa(z10.VnS(), z4.VnS()); __ Fexpa(z11.VnH(), z5.VnH()); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z6[] = {0x0000000000000000, 0x44002c9a3e778061}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0x0802d285a6e4030b, 0x4c06623882552225}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0x100fa7c1819e90d8, 0x5410000000000000}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = {0x00000000000164d2, 0x0016942d003311c4}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z10[] = {0x0054f35b407d3e0c, 0x00800000608164d2}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); uint64_t expected_z11[] = {0x00000016000000a8, 0x00c2018903d40400}; ASSERT_EQUAL_SVE(expected_z11, z11.VnD()); } } TEST_SVE(sve_rev_p) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); Initialise(&masm, p0.VnB(), 0xabcdabcdabcdabcd, 0xabcdabcdabcdabcd, 0xabcdabcdabcdabcd, 0xabcdabcdabcdabcd); __ Rev(p1.VnB(), p0.VnB()); __ Rev(p2.VnH(), p0.VnH()); __ Rev(p3.VnS(), p0.VnS()); __ Rev(p4.VnD(), p0.VnD()); END(); if (CAN_RUN()) { RUN(); int p1_expected[] = {1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1}; ASSERT_EQUAL_SVE(p1_expected, p1.VnB()); int p2_expected[] = {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p2_expected, p2.VnB()); int p3_expected[] = {1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p3_expected, p3.VnB()); int p4_expected[] = {1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1}; ASSERT_EQUAL_SVE(p4_expected, p4.VnB()); } } TEST_SVE(sve_trn_p_bh) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); Initialise(&masm, p0.VnB(), 0xa5a55a5a); __ Pfalse(p1.VnB()); __ Trn1(p2.VnB(), p0.VnB(), p0.VnB()); __ Trn2(p3.VnB(), p0.VnB(), p0.VnB()); __ Trn1(p4.VnB(), p1.VnB(), p0.VnB()); __ Trn2(p5.VnB(), p1.VnB(), p0.VnB()); __ Trn1(p6.VnB(), p0.VnB(), p1.VnB()); __ Trn2(p7.VnB(), p0.VnB(), p1.VnB()); __ Trn1(p8.VnH(), p0.VnH(), p0.VnH()); __ Trn2(p9.VnH(), p0.VnH(), p0.VnH()); __ Trn1(p10.VnH(), p1.VnH(), p0.VnH()); __ Trn2(p11.VnH(), p1.VnH(), p0.VnH()); __ Trn1(p12.VnH(), p0.VnH(), p1.VnH()); __ Trn2(p13.VnH(), p0.VnH(), p1.VnH()); END(); if (CAN_RUN()) { RUN(); int p2_expected[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0}; int p3_expected[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 
0, 0, 0, 1, 1, 1, 1}; ASSERT_EQUAL_SVE(p2_expected, p2.VnB()); ASSERT_EQUAL_SVE(p3_expected, p3.VnB()); int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}; int p5_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p4_expected, p4.VnB()); ASSERT_EQUAL_SVE(p5_expected, p5.VnB()); int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0}; int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1}; ASSERT_EQUAL_SVE(p6_expected, p6.VnB()); ASSERT_EQUAL_SVE(p7_expected, p7.VnB()); int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0}; int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p8_expected, p8.VnB()); ASSERT_EQUAL_SVE(p9_expected, p9.VnB()); int p10_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0}; int p11_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0}; ASSERT_EQUAL_SVE(p10_expected, p10.VnB()); ASSERT_EQUAL_SVE(p11_expected, p11.VnB()); int p12_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0}; int p13_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0}; ASSERT_EQUAL_SVE(p12_expected, p12.VnB()); ASSERT_EQUAL_SVE(p13_expected, p13.VnB()); } } TEST_SVE(sve_trn_p_sd) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); Initialise(&masm, p0.VnB(), 0x55a55aaa); __ Pfalse(p1.VnB()); __ Trn1(p2.VnS(), p0.VnS(), p0.VnS()); __ Trn2(p3.VnS(), p0.VnS(), p0.VnS()); __ Trn1(p4.VnS(), p1.VnS(), p0.VnS()); __ Trn2(p5.VnS(), p1.VnS(), p0.VnS()); __ Trn1(p6.VnS(), p0.VnS(), p1.VnS()); __ Trn2(p7.VnS(), p0.VnS(), p1.VnS()); __ Trn1(p8.VnD(), p0.VnD(), p0.VnD()); __ Trn2(p9.VnD(), p0.VnD(), p0.VnD()); __ Trn1(p10.VnD(), p1.VnD(), p0.VnD()); __ Trn2(p11.VnD(), p1.VnD(), p0.VnD()); __ Trn1(p12.VnD(), p0.VnD(), p1.VnD()); __ Trn2(p13.VnD(), p0.VnD(), p1.VnD()); END(); if (CAN_RUN()) { RUN(); int p2_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p2_expected, p2.VnB()); ASSERT_EQUAL_SVE(p3_expected, p3.VnB()); int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}; int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}; ASSERT_EQUAL_SVE(p4_expected, p4.VnB()); ASSERT_EQUAL_SVE(p5_expected, p5.VnB()); int p6_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0}; int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p6_expected, p6.VnB()); ASSERT_EQUAL_SVE(p7_expected, p7.VnB()); int p8_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p8_expected, p8.VnB()); ASSERT_EQUAL_SVE(p9_expected, p9.VnB()); int p10_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}; ASSERT_EQUAL_SVE(p10_expected, p10.VnB()); ASSERT_EQUAL_SVE(p11_expected, p11.VnB()); int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0}; int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p12_expected, p12.VnB()); ASSERT_EQUAL_SVE(p13_expected, p13.VnB()); } } TEST_SVE(sve_zip_p_bh) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); Initialise(&masm, p0.VnB(), 0x5a5a5a5a5a5a5a5a, 0x5a5a5a5a5a5a5a5a, 0x5a5a5a5a5a5a5a5a, 0x5a5a5a5a5a5a5a5a); __ Pfalse(p1.VnB()); __ Zip1(p2.VnB(), p0.VnB(), p0.VnB()); __ Zip2(p3.VnB(), p0.VnB(), p0.VnB()); __ Zip1(p4.VnB(), p1.VnB(), p0.VnB()); __ 
Zip2(p5.VnB(), p1.VnB(), p0.VnB()); __ Zip1(p6.VnB(), p0.VnB(), p1.VnB()); __ Zip2(p7.VnB(), p0.VnB(), p1.VnB()); __ Zip1(p8.VnH(), p0.VnH(), p0.VnH()); __ Zip2(p9.VnH(), p0.VnH(), p0.VnH()); __ Zip1(p10.VnH(), p1.VnH(), p0.VnH()); __ Zip2(p11.VnH(), p1.VnH(), p0.VnH()); __ Zip1(p12.VnH(), p0.VnH(), p1.VnH()); __ Zip2(p13.VnH(), p0.VnH(), p1.VnH()); END(); if (CAN_RUN()) { RUN(); int p2_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0}; int p3_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0}; ASSERT_EQUAL_SVE(p2_expected, p2.VnB()); ASSERT_EQUAL_SVE(p3_expected, p3.VnB()); int p4_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0}; int p5_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0}; ASSERT_EQUAL_SVE(p4_expected, p4.VnB()); ASSERT_EQUAL_SVE(p5_expected, p5.VnB()); int p6_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0}; int p7_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0}; ASSERT_EQUAL_SVE(p6_expected, p6.VnB()); ASSERT_EQUAL_SVE(p7_expected, p7.VnB()); int p8_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0}; int p9_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p8_expected, p8.VnB()); ASSERT_EQUAL_SVE(p9_expected, p9.VnB()); int p10_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}; int p11_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}; ASSERT_EQUAL_SVE(p10_expected, p10.VnB()); ASSERT_EQUAL_SVE(p11_expected, p11.VnB()); int p12_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0}; int p13_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0}; ASSERT_EQUAL_SVE(p12_expected, p12.VnB()); ASSERT_EQUAL_SVE(p13_expected, p13.VnB()); } } TEST_SVE(sve_zip_p_sd) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); Initialise(&masm, p0.VnB(), 0x5a5a5a5a5a5a5a5a, 0x5a5a5a5a5a5a5a5a, 0x5a5a5a5a5a5a5a5a, 0x5a5a5a5a5a5a5a5a); __ Pfalse(p1.VnB()); __ Zip1(p2.VnS(), p0.VnS(), p0.VnS()); __ Zip2(p3.VnS(), p0.VnS(), p0.VnS()); __ Zip1(p4.VnS(), p1.VnS(), p0.VnS()); __ Zip2(p5.VnS(), p1.VnS(), p0.VnS()); __ Zip1(p6.VnS(), p0.VnS(), p1.VnS()); __ Zip2(p7.VnS(), p0.VnS(), p1.VnS()); __ Zip1(p8.VnD(), p0.VnD(), p0.VnD()); __ Zip2(p9.VnD(), p0.VnD(), p0.VnD()); __ Zip1(p10.VnD(), p1.VnD(), p0.VnD()); __ Zip2(p11.VnD(), p1.VnD(), p0.VnD()); __ Zip1(p12.VnD(), p0.VnD(), p1.VnD()); __ Zip2(p13.VnD(), p0.VnD(), p1.VnD()); END(); if (CAN_RUN()) { RUN(); int p2_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0}; int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p2_expected, p2.VnB()); ASSERT_EQUAL_SVE(p3_expected, p3.VnB()); int p4_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}; int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}; ASSERT_EQUAL_SVE(p4_expected, p4.VnB()); ASSERT_EQUAL_SVE(p5_expected, p5.VnB()); int p6_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0}; int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p6_expected, p6.VnB()); ASSERT_EQUAL_SVE(p7_expected, p7.VnB()); int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0}; int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p8_expected, p8.VnB()); ASSERT_EQUAL_SVE(p9_expected, p9.VnB()); int p10_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}; ASSERT_EQUAL_SVE(p10_expected, p10.VnB()); ASSERT_EQUAL_SVE(p11_expected, 
p11.VnB()); int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0}; int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0}; ASSERT_EQUAL_SVE(p12_expected, p12.VnB()); ASSERT_EQUAL_SVE(p13_expected, p13.VnB()); } } TEST_SVE(sve_uzp_p) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); Initialise(&masm, p0.VnB(), 0xf0f0ff00ffff0000, 0x4242424242424242, 0x5a5a5a5a5a5a5a5a, 0x0123456789abcdef); __ Rev(p1.VnB(), p0.VnB()); __ Zip1(p2.VnB(), p0.VnB(), p1.VnB()); __ Zip2(p3.VnB(), p0.VnB(), p1.VnB()); __ Uzp1(p4.VnB(), p2.VnB(), p3.VnB()); __ Uzp2(p5.VnB(), p2.VnB(), p3.VnB()); __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); __ Zip2(p3.VnH(), p0.VnH(), p1.VnH()); __ Uzp1(p6.VnH(), p2.VnH(), p3.VnH()); __ Uzp2(p7.VnH(), p2.VnH(), p3.VnH()); __ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); __ Zip2(p3.VnS(), p0.VnS(), p1.VnS()); __ Uzp1(p8.VnS(), p2.VnS(), p3.VnS()); __ Uzp2(p9.VnS(), p2.VnS(), p3.VnS()); __ Zip1(p2.VnD(), p0.VnD(), p1.VnD()); __ Zip2(p3.VnD(), p0.VnD(), p1.VnD()); __ Uzp1(p10.VnD(), p2.VnD(), p3.VnD()); __ Uzp2(p11.VnD(), p2.VnD(), p3.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(p0, p4); ASSERT_EQUAL_SVE(p1, p5); ASSERT_EQUAL_SVE(p0, p6); ASSERT_EQUAL_SVE(p1, p7); ASSERT_EQUAL_SVE(p0, p8); ASSERT_EQUAL_SVE(p1, p9); ASSERT_EQUAL_SVE(p0, p10); ASSERT_EQUAL_SVE(p1, p11); } } TEST_SVE(sve_punpk) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); auto get_64_bits_at = [](int byte_index) -> uint64_t { // Each 8-bit chunk has the value 0x50 + the byte index of the chunk. return 0x5756555453525150 + (0x0101010101010101 * byte_index); }; Initialise(&masm, p0.VnB(), get_64_bits_at(24), get_64_bits_at(16), get_64_bits_at(8), get_64_bits_at(0)); __ Punpklo(p1.VnH(), p0.VnB()); __ Punpkhi(p2.VnH(), p0.VnB()); END(); if (CAN_RUN()) { RUN(); int pl = config->sve_vl_in_bits() / kZRegBitsPerPRegBit; // For simplicity, just test the bottom 64 H-sized lanes. uint64_t p1_h_bits = get_64_bits_at(0); uint64_t p2_h_bits = get_64_bits_at(pl / (2 * 8)); int p1_expected[64]; int p2_expected[64]; for (size_t i = 0; i < 64; i++) { p1_expected[63 - i] = (p1_h_bits >> i) & 1; p2_expected[63 - i] = (p2_h_bits >> i) & 1; } // Testing `VnH` ensures that odd-numbered B lanes are zero. ASSERT_EQUAL_SVE(p1_expected, p1.VnH()); ASSERT_EQUAL_SVE(p2_expected, p2.VnH()); } } typedef void (MacroAssembler::*BrkFn)(const PRegisterWithLaneSize& pd, const PRegister& pg, const PRegisterWithLaneSize& pn); typedef void (MacroAssembler::*BrksFn)(const PRegisterWithLaneSize& pd, const PRegisterZ& pg, const PRegisterWithLaneSize& pn); template static void BrkaBrkbHelper(Test* config, BrkFn macro, BrksFn macro_set_flags, const T (&pd_inputs)[N], const T (&pg_inputs)[N], const T (&pn_inputs)[N], const T (&pd_z_expected)[N]) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); PRegister pg = p10; PRegister pn = p9; PRegister pd_z = p0; PRegister pd_z_s = p1; PRegister pd_m = p2; Initialise(&masm, pg.VnB(), pg_inputs); Initialise(&masm, pn.VnB(), pn_inputs); Initialise(&masm, pd_m.VnB(), pd_inputs); // Initialise NZCV to an impossible value, to check that we actually write it. __ Mov(x10, NZCVFlag); __ Msr(NZCV, x10); (masm.*macro)(pd_z.VnB(), pg.Zeroing(), pn.VnB()); (masm.*macro_set_flags)(pd_z_s.VnB(), pg.Zeroing(), pn.VnB()); __ Mrs(x0, NZCV); (masm.*macro)(pd_m.VnB(), pg.Merging(), pn.VnB()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(pd_z_expected, pd_z.VnB()); // Check that the flags were properly set. 
StatusFlags nzcv_expected = GetPredTestFlags(pd_z_expected, pg_inputs, core.GetSVELaneCount(kBRegSize)); ASSERT_EQUAL_64(nzcv_expected, x0); ASSERT_EQUAL_SVE(pd_z.VnB(), pd_z_s.VnB()); T pd_m_expected[N]; // Set expected `pd` result on merging predication. for (size_t i = 0; i < N; i++) { pd_m_expected[i] = pg_inputs[i] ? pd_z_expected[i] : pd_inputs[i]; } ASSERT_EQUAL_SVE(pd_m_expected, pd_m.VnB()); } } template static void BrkaHelper(Test* config, const T& pd_inputs, const T& pg_inputs, const T& pn_inputs, const T& pd_expected) { BrkaBrkbHelper(config, &MacroAssembler::Brka, &MacroAssembler::Brkas, pd_inputs, pg_inputs, pn_inputs, pd_expected); } TEST_SVE(sve_brka) { // clang-format off // | boundary of 128-bits VL. // v int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // | highest-numbered lane lowest-numbered lane | // v v int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}; int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}; int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}; int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1}; // | first break // v int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0}; // | first break // v int exp_1_2[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}; // | first break // v int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}; BrkaHelper(config, pd, pg_1, pn_1, exp_1_1); BrkaHelper(config, pd, pg_1, pn_2, exp_1_2); BrkaHelper(config, pd, pg_1, pn_3, exp_1_3); // | first break // v int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1}; // | first break // v int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}; // | first break // v int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; BrkaHelper(config, pd, pg_2, pn_1, exp_2_1); BrkaHelper(config, pd, pg_2, pn_2, exp_2_2); BrkaHelper(config, pd, pg_2, pn_3, exp_2_3); // The all-inactive zeroing predicate sets destination predicate all-false. int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; BrkaHelper(config, pd, pg_3, pn_1, exp_3_x); BrkaHelper(config, pd, pg_3, pn_2, exp_3_x); BrkaHelper(config, pd, pg_3, pn_3, exp_3_x); // clang-format on } template static void BrkbHelper(Test* config, const T& pd_inputs, const T& pg_inputs, const T& pn_inputs, const T& pd_expected) { BrkaBrkbHelper(config, &MacroAssembler::Brkb, &MacroAssembler::Brkbs, pd_inputs, pg_inputs, pn_inputs, pd_expected); } TEST_SVE(sve_brkb) { // clang-format off // | boundary of 128-bits VL. 
// v int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // | highest-numbered lane lowest-numbered lane | // v v int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}; int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}; int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}; int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1}; // | first break // v int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}; // | first break // v int exp_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}; // | first break // v int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0}; BrkbHelper(config, pd, pg_1, pn_1, exp_1_1); BrkbHelper(config, pd, pg_1, pn_2, exp_1_2); BrkbHelper(config, pd, pg_1, pn_3, exp_1_3); // | first break // v int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1}; // | first break // v int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}; // | first break // v int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; BrkbHelper(config, pd, pg_2, pn_1, exp_2_1); BrkbHelper(config, pd, pg_2, pn_2, exp_2_2); BrkbHelper(config, pd, pg_2, pn_3, exp_2_3); // The all-inactive zeroing predicate sets destination predicate all-false. int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; BrkbHelper(config, pd, pg_3, pn_1, exp_3_x); BrkbHelper(config, pd, pg_3, pn_2, exp_3_x); BrkbHelper(config, pd, pg_3, pn_3, exp_3_x); // clang-format on } typedef void (MacroAssembler::*BrknFn)(const PRegisterWithLaneSize& pd, const PRegisterZ& pg, const PRegisterWithLaneSize& pn, const PRegisterWithLaneSize& pm); typedef void (MacroAssembler::*BrknsFn)(const PRegisterWithLaneSize& pd, const PRegisterZ& pg, const PRegisterWithLaneSize& pn, const PRegisterWithLaneSize& pm); enum BrknDstPredicateState { kAllFalse, kUnchanged }; template static void BrknHelper(Test* config, const T (&pd_inputs)[N], const T (&pg_inputs)[N], const T (&pn_inputs)[N], const T (&pm_inputs)[N], BrknDstPredicateState expected_pd_state) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); PRegister pg = p10; PRegister pn = p9; PRegister pm = p8; PRegister pdm = p0; PRegister pd = p1; PRegister pd_s = p2; Initialise(&masm, pg.VnB(), pg_inputs); Initialise(&masm, pn.VnB(), pn_inputs); Initialise(&masm, pm.VnB(), pm_inputs); Initialise(&masm, pdm.VnB(), pm_inputs); Initialise(&masm, pd.VnB(), pd_inputs); Initialise(&masm, pd_s.VnB(), pd_inputs); // Initialise NZCV to an impossible value, to check that we actually write it. __ Mov(x10, NZCVFlag); __ Msr(NZCV, x10); __ Brkn(pdm.VnB(), pg.Zeroing(), pn.VnB(), pdm.VnB()); // !pd.Aliases(pm). __ Brkn(pd.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB()); __ Brkns(pd_s.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB()); __ Mrs(x0, NZCV); END(); if (CAN_RUN()) { RUN(); T all_false[N] = {0}; if (expected_pd_state == kAllFalse) { ASSERT_EQUAL_SVE(all_false, pd.VnB()); } else { ASSERT_EQUAL_SVE(pm_inputs, pd.VnB()); } ASSERT_EQUAL_SVE(pm_inputs, pm.VnB()); T all_true[N]; for (size_t i = 0; i < ArrayLength(all_true); i++) { all_true[i] = 1; } // Check that the flags were properly set. StatusFlags nzcv_expected = GetPredTestFlags((expected_pd_state == kAllFalse) ? 
all_false : pm_inputs, all_true, core.GetSVELaneCount(kBRegSize)); ASSERT_EQUAL_64(nzcv_expected, x0); ASSERT_EQUAL_SVE(pd.VnB(), pdm.VnB()); ASSERT_EQUAL_SVE(pd.VnB(), pd_s.VnB()); } } TEST_SVE(sve_brkn) { int pd[] = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0}; int pm[] = {0, 1, 1, 1, 1, 0, 0, 1, 0, 1}; int pg_1[] = {1, 1, 0, 0, 1, 0, 1, 1, 0, 0}; int pg_2[] = {0, 0, 0, 1, 1, 1, 0, 0, 1, 1}; int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int pn_1[] = {1, 0, 0, 0, 0, 1, 1, 0, 0, 0}; int pn_2[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0}; int pn_3[] = {0, 0, 0, 0, 1, 1, 0, 0, 1, 1}; BrknHelper(config, pd, pg_1, pn_1, pm, kUnchanged); BrknHelper(config, pd, pg_1, pn_2, pm, kAllFalse); BrknHelper(config, pd, pg_1, pn_3, pm, kAllFalse); BrknHelper(config, pd, pg_2, pn_1, pm, kAllFalse); BrknHelper(config, pd, pg_2, pn_2, pm, kUnchanged); BrknHelper(config, pd, pg_2, pn_3, pm, kAllFalse); BrknHelper(config, pd, pg_3, pn_1, pm, kAllFalse); BrknHelper(config, pd, pg_3, pn_2, pm, kAllFalse); BrknHelper(config, pd, pg_3, pn_3, pm, kAllFalse); } TEST_SVE(sve_trn) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in0[] = {0xffeeddccbbaa9988, 0x7766554433221100}; uint64_t in1[] = {0xaa55aa55aa55aa55, 0x55aa55aa55aa55aa}; InsrHelper(&masm, z0.VnD(), in0); InsrHelper(&masm, z1.VnD(), in1); __ Trn1(z2.VnB(), z0.VnB(), z1.VnB()); __ Trn2(z3.VnB(), z0.VnB(), z1.VnB()); __ Trn1(z4.VnH(), z0.VnH(), z1.VnH()); __ Trn2(z5.VnH(), z0.VnH(), z1.VnH()); __ Trn1(z6.VnS(), z0.VnS(), z1.VnS()); __ Trn2(z7.VnS(), z0.VnS(), z1.VnS()); __ Trn1(z8.VnD(), z0.VnD(), z1.VnD()); __ Trn2(z9.VnD(), z0.VnD(), z1.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z2[] = {0x55ee55cc55aa5588, 0xaa66aa44aa22aa00}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0xaaffaaddaabbaa99, 0x5577555555335511}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0xaa55ddccaa559988, 0x55aa554455aa1100}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0xaa55ffeeaa55bbaa, 0x55aa776655aa3322}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0xaa55aa55bbaa9988, 0x55aa55aa33221100}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0x55aa55aa77665544}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); } } TEST_SVE(sve_zip_uzp) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Dup(z0.VnD(), 0xffeeddccbbaa9988); __ Insr(z0.VnD(), 0x7766554433221100); __ Dup(z1.VnD(), 0xaa55aa55aa55aa55); __ Insr(z1.VnD(), 0x55aa55aa55aa55aa); __ Zip1(z2.VnB(), z0.VnB(), z1.VnB()); __ Zip2(z3.VnB(), z0.VnB(), z1.VnB()); __ Zip1(z4.VnH(), z0.VnH(), z1.VnH()); __ Zip2(z5.VnH(), z0.VnH(), z1.VnH()); __ Zip1(z6.VnS(), z0.VnS(), z1.VnS()); __ Zip2(z7.VnS(), z0.VnS(), z1.VnS()); __ Zip1(z8.VnD(), z0.VnD(), z1.VnD()); __ Zip2(z9.VnD(), z0.VnD(), z1.VnD()); __ Uzp1(z10.VnB(), z2.VnB(), z3.VnB()); __ Uzp2(z11.VnB(), z2.VnB(), z3.VnB()); __ Uzp1(z12.VnH(), z4.VnH(), z5.VnH()); __ Uzp2(z13.VnH(), z4.VnH(), z5.VnH()); __ Uzp1(z14.VnS(), z6.VnS(), z7.VnS()); __ Uzp2(z15.VnS(), z6.VnS(), z7.VnS()); __ Uzp1(z16.VnD(), z8.VnD(), z9.VnD()); __ Uzp2(z17.VnD(), z8.VnD(), z9.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z2[] = {0x5577aa665555aa44, 0x5533aa225511aa00}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0xaaff55eeaadd55cc, 
0xaabb55aaaa995588}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0x55aa776655aa5544, 0x55aa332255aa1100}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0xaa55ffeeaa55ddcc, 0xaa55bbaaaa559988}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x55aa55aa77665544, 0x55aa55aa33221100}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0xaa55aa55bbaa9988}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); // Check uzp is the opposite of zip. ASSERT_EQUAL_SVE(z0.VnD(), z10.VnD()); ASSERT_EQUAL_SVE(z1.VnD(), z11.VnD()); ASSERT_EQUAL_SVE(z0.VnD(), z12.VnD()); ASSERT_EQUAL_SVE(z1.VnD(), z13.VnD()); ASSERT_EQUAL_SVE(z0.VnD(), z14.VnD()); ASSERT_EQUAL_SVE(z1.VnD(), z15.VnD()); ASSERT_EQUAL_SVE(z0.VnD(), z16.VnD()); ASSERT_EQUAL_SVE(z1.VnD(), z17.VnD()); } } TEST_SVE(sve_fcadd) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Dup(z30.VnS(), 0); __ Ptrue(p0.VnB()); __ Pfalse(p1.VnB()); __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements. __ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements. __ Fdup(z0.VnH(), 10.0); // 10i + 10 __ Fdup(z1.VnH(), 5.0); // 5i + 5 __ Index(z7.VnH(), 1, 1); __ Scvtf(z7.VnH(), p0.Merging(), z7.VnH()); // Ai + B __ Sel(z2.VnH(), p3, z1.VnH(), z30.VnH()); // 5i + 0 __ Sel(z3.VnH(), p2, z1.VnH(), z30.VnH()); // 0i + 5 __ Sel(z7.VnH(), p3, z7.VnH(), z0.VnH()); // Ai + 10 __ Mov(z8, z7); __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 2); __ Sel(z8.VnH(), p2, z8.VnH(), z30.VnH()); // 0i + A // (10i + 10) + rotate(5i + 0, 90) // = (10i + 10) + (0i - 5) // = 10i + 5 __ Fcadd(z4.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 90); // (10i + 5) + rotate(0i + 5, 270) // = (10i + 5) + (-5i + 0) // = 5i + 5 __ Fcadd(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH(), 270); // The same calculation, but selecting real/imaginary using predication. __ Mov(z5, z0); __ Fcadd(z5.VnH(), p2.Merging(), z5.VnH(), z1.VnH(), 90); __ Fcadd(z5.VnH(), p3.Merging(), z5.VnH(), z1.VnH(), 270); // Reference calculation: (10i + 10) - (5i + 5) __ Fsub(z6.VnH(), z0.VnH(), z1.VnH()); // Calculation using varying imaginary values. // (Ai + 10) + rotate(5i + 0, 90) // = (Ai + 10) + (0i - 5) // = Ai + 5 __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z2.VnH(), 90); // (Ai + 5) + rotate(0i + A, 270) // = (Ai + 5) + (-Ai + 0) // = 5 __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z8.VnH(), 270); // Repeated, but for wider elements. 
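  // (A brief, hedged recap of the pattern being repeated: going by the worked
  // values above, `Fcadd` with rotation 90 behaves like adding i*b to each
  // complex pair and rotation 270 like subtracting i*b, so applying both
  // rotations is expected to reduce to the plain `Fsub` reference.)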
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); __ Zip1(p3.VnS(), p1.VnS(), p0.VnS()); __ Fdup(z0.VnS(), 42.0); __ Fdup(z1.VnS(), 21.0); __ Index(z11.VnS(), 1, 1); __ Scvtf(z11.VnS(), p0.Merging(), z11.VnS()); __ Sel(z2.VnS(), p3, z1.VnS(), z30.VnS()); __ Sel(z29.VnS(), p2, z1.VnS(), z30.VnS()); __ Sel(z11.VnS(), p3, z11.VnS(), z0.VnS()); __ Mov(z12, z11); __ Ext(z12.VnB(), z12.VnB(), z12.VnB(), 4); __ Sel(z12.VnS(), p2, z12.VnS(), z30.VnS()); __ Fcadd(z8.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 90); __ Fcadd(z8.VnS(), p0.Merging(), z8.VnS(), z29.VnS(), 270); __ Mov(z9, z0); __ Fcadd(z9.VnS(), p2.Merging(), z9.VnS(), z1.VnS(), 90); __ Fcadd(z9.VnS(), p3.Merging(), z9.VnS(), z1.VnS(), 270); __ Fsub(z10.VnS(), z0.VnS(), z1.VnS()); __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z2.VnS(), 90); __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z12.VnS(), 270); __ Zip1(p2.VnD(), p0.VnD(), p1.VnD()); __ Zip1(p3.VnD(), p1.VnD(), p0.VnD()); __ Fdup(z0.VnD(), -42.0); __ Fdup(z1.VnD(), -21.0); __ Index(z15.VnD(), 1, 1); __ Scvtf(z15.VnD(), p0.Merging(), z15.VnD()); __ Sel(z2.VnD(), p3, z1.VnD(), z30.VnD()); __ Sel(z28.VnD(), p2, z1.VnD(), z30.VnD()); __ Sel(z15.VnD(), p3, z15.VnD(), z0.VnD()); __ Mov(z16, z15); __ Ext(z16.VnB(), z16.VnB(), z16.VnB(), 8); __ Sel(z16.VnD(), p2, z16.VnD(), z30.VnD()); __ Fcadd(z12.VnD(), p0.Merging(), z0.VnD(), z2.VnD(), 90); __ Fcadd(z12.VnD(), p0.Merging(), z12.VnD(), z28.VnD(), 270); __ Mov(z13, z0); __ Fcadd(z13.VnD(), p2.Merging(), z13.VnD(), z1.VnD(), 90); __ Fcadd(z13.VnD(), p3.Merging(), z13.VnD(), z1.VnD(), 270); __ Fsub(z14.VnD(), z0.VnD(), z1.VnD()); __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z2.VnD(), 90); __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z16.VnD(), 270); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH()); ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH()); ASSERT_EQUAL_SVE(z3.VnH(), z7.VnH()); ASSERT_EQUAL_SVE(z10.VnS(), z8.VnS()); ASSERT_EQUAL_SVE(z10.VnS(), z9.VnS()); ASSERT_EQUAL_SVE(z29.VnS(), z11.VnS()); ASSERT_EQUAL_SVE(z14.VnD(), z12.VnD()); ASSERT_EQUAL_SVE(z14.VnD(), z13.VnD()); ASSERT_EQUAL_SVE(z28.VnS(), z15.VnS()); } } TEST_SVE(sve_fcmla_index) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); __ Fdup(z0.VnH(), 10.0); __ Fdup(z2.VnH(), 2.0); __ Zip1(z0.VnH(), z0.VnH(), z2.VnH()); // Duplicate complex numbers across z2 segments. First segment has 1i+0, // second has 3i+2, etc. __ Index(z1.VnH(), 0, 1); __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH()); __ Zip1(z2.VnS(), z1.VnS(), z1.VnS()); __ Zip1(z2.VnS(), z2.VnS(), z2.VnS()); // Derive a vector from z2 where only the third element in each segment // contains a complex number, with other elements zero. __ Index(z3.VnS(), 0, 1); __ And(z3.VnS(), z3.VnS(), 3); __ Cmpeq(p2.VnS(), p0.Zeroing(), z3.VnS(), 2); __ Dup(z3.VnB(), 0); __ Sel(z3.VnS(), p2, z2.VnS(), z3.VnS()); // Use indexed complex multiply on this vector, indexing the third element. __ Dup(z4.VnH(), 0); __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 0); __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 90); // Rotate the indexed complex number and repeat, negated, and with a different // index. __ Ext(z3.VnH(), z3.VnH(), z3.VnH(), 4); __ Dup(z5.VnH(), 0); __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 180); __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 270); __ Fneg(z5.VnH(), p0.Merging(), z5.VnH()); // Create a reference result from a vector complex multiply. 
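  // A short, hedged aside on the reference: applying `Fcmla` twice, with
  // rotations 0 and 90, to a zeroed accumulator yields the full complex
  // product of the two sources, treating each even/odd element pair as
  // (real, imaginary):
  //   (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
  // The checks at the end of the test compare the indexed results above
  // against this unindexed reference.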
__ Dup(z6.VnH(), 0); __ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 0); __ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 90); // Repeated, but for wider elements. __ Fdup(z0.VnS(), 42.0); __ Fdup(z2.VnS(), 24.0); __ Zip1(z0.VnS(), z0.VnS(), z2.VnS()); __ Index(z1.VnS(), -42, 13); __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS()); __ Zip1(z2.VnD(), z1.VnD(), z1.VnD()); __ Zip1(z2.VnD(), z2.VnD(), z2.VnD()); __ Index(z3.VnD(), 0, 1); __ And(z3.VnD(), z3.VnD(), 1); __ Cmpeq(p2.VnD(), p0.Zeroing(), z3.VnD(), 1); __ Dup(z3.VnB(), 0); __ Sel(z3.VnD(), p2, z2.VnD(), z3.VnD()); __ Dup(z7.VnS(), 0); __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 0); __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 90); __ Ext(z3.VnB(), z3.VnB(), z3.VnB(), 8); __ Dup(z8.VnS(), 0); __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 180); __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 270); __ Fneg(z8.VnS(), p0.Merging(), z8.VnS()); __ Dup(z9.VnS(), 0); __ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 0); __ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 90); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH()); ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH()); ASSERT_EQUAL_SVE(z9.VnS(), z7.VnS()); ASSERT_EQUAL_SVE(z9.VnS(), z8.VnS()); } } TEST_SVE(sve_fcmla) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); __ Pfalse(p1.VnB()); __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements. __ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements. __ Fdup(z0.VnH(), 10.0); __ Fdup(z2.VnH(), 2.0); // Create pairs of complex numbers, Ai + A. A is chosen to be non-zero, as // the later fneg will result in a failed comparison otherwise. __ Index(z1.VnH(), -4, 3); __ Zip1(z1.VnH(), z1.VnH(), z1.VnH()); __ Zip1(z1.VnH(), z1.VnH(), z1.VnH()); __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH()); __ Sel(z3.VnH(), p2, z0.VnH(), z1.VnH()); // Ai + 10 __ Sel(z4.VnH(), p2, z1.VnH(), z2.VnH()); // 2i + A __ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); // Even complex numbers. __ Zip1(p3.VnS(), p1.VnS(), p0.VnS()); // Odd complex numbers. // Calculate (Ai + 10) * (2i + A) = (20 + A^2)i + 8A, using predication to // select only the complex numbers in odd-numbered element pairs. This leaves // results in elements 2/3, 6/7, etc. with zero in elements 0/1, 4/5, etc. // ... 7 6 5 4 3 2 1 0 <-- element // ... | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | 0 | 0 | <-- value __ Dup(z5.VnH(), 0); __ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 0); __ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 90); // Move the odd results to the even result positions. // ... 7 6 5 4 3 2 1 0 <-- element // ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 4); // Calculate -(Ai + 10) * (2i + A) = -(20 + A^2)i - 8A for the even complex // numbers. // ... 7 6 5 4 3 2 1 0 <-- element // ... | 0 | 0 | -20-A^2 | -8A | 0 | 0 | -20-A^2 | -8A | <-- value __ Dup(z6.VnH(), 0); __ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 180); __ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 270); // Negate the even results. The results in z6 should now match the results // computed earlier in z5. // ... 7 6 5 4 3 2 1 0 <-- element // ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value __ Fneg(z6.VnH(), p2.Merging(), z6.VnH()); // Similarly, but for wider elements. 
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); __ Zip1(p3.VnS(), p1.VnS(), p0.VnS()); __ Index(z1.VnS(), -4, 3); __ Zip1(z1.VnS(), z1.VnS(), z1.VnS()); __ Zip1(z1.VnS(), z1.VnS(), z1.VnS()); __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS()); __ Fdup(z0.VnS(), 20.0); __ Fdup(z2.VnS(), 21.0); __ Sel(z3.VnS(), p2, z0.VnS(), z1.VnS()); __ Sel(z4.VnS(), p2, z1.VnS(), z2.VnS()); __ Punpklo(p2.VnH(), p2.VnB()); __ Punpklo(p3.VnH(), p3.VnB()); __ Dup(z7.VnS(), 0); __ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 0); __ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 90); __ Ext(z7.VnB(), z7.VnB(), z7.VnB(), 8); __ Dup(z8.VnS(), 0); __ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 180); __ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 270); __ Fneg(z8.VnS(), p2.Merging(), z8.VnS()); // Double precision computed for even lanes only. __ Zip1(p2.VnD(), p0.VnD(), p1.VnD()); __ Index(z1.VnD(), -4, 3); __ Zip1(z1.VnD(), z1.VnD(), z1.VnD()); __ Zip1(z1.VnD(), z1.VnD(), z1.VnD()); __ Scvtf(z1.VnD(), p0.Merging(), z1.VnD()); __ Fdup(z0.VnD(), 20.0); __ Fdup(z2.VnD(), 21.0); __ Sel(z3.VnD(), p2, z0.VnD(), z1.VnD()); __ Sel(z4.VnD(), p2, z1.VnD(), z2.VnD()); __ Punpklo(p2.VnH(), p2.VnB()); __ Dup(z9.VnD(), 0); __ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 0); __ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 90); __ Dup(z10.VnD(), 0); __ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 180); __ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 270); __ Fneg(z10.VnD(), p2.Merging(), z10.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z5.VnH(), z6.VnH()); ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS()); ASSERT_EQUAL_SVE(z9.VnD(), z10.VnD()); } } // Create a pattern in dst where the value of each element in src is incremented // by the segment number. This allows varying a short input by a predictable // pattern for each segment. static void FPSegmentPatternHelper(MacroAssembler* masm, const ZRegister& dst, const PRegisterM& ptrue, const ZRegister& src) { VIXL_ASSERT(AreSameLaneSize(dst, src)); UseScratchRegisterScope temps(masm); ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst); masm->Index(ztmp, 0, 1); masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2()); masm->Scvtf(ztmp, ptrue, ztmp); masm->Fadd(dst, src, ztmp); } TEST_SVE(sve_fpmul_index) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in0[] = {0x3ff000003f803c00, 0xbff00000bf80bc00}; uint64_t in1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76}; __ Ptrue(p0.VnB()); // Repeat indexed vector across up to 2048-bit VL. for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i++) { InsrHelper(&masm, z25.VnD(), in0); } InsrHelper(&masm, z1.VnD(), in1); FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z25.VnH()); __ Fmul(z2.VnH(), z1.VnH(), z0.VnH(), 0); __ Fmul(z3.VnH(), z1.VnH(), z0.VnH(), 1); __ Fmul(z4.VnH(), z1.VnH(), z0.VnH(), 4); __ Fmul(z5.VnH(), z1.VnH(), z0.VnH(), 7); __ Fmul(z6.VnS(), z1.VnS(), z0.VnS(), 0); __ Fmul(z7.VnS(), z1.VnS(), z0.VnS(), 1); __ Fmul(z8.VnS(), z1.VnS(), z0.VnS(), 2); __ Fmul(z9.VnS(), z1.VnS(), z0.VnS(), 3); __ Fmul(z10.VnD(), z1.VnD(), z0.VnD(), 0); __ Fmul(z11.VnD(), z1.VnD(), z0.VnD(), 1); // Compute the results using other instructions. 
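  // A hedged note on the reference sequence below: the indexed form of `Fmul`
  // takes its second operand from the indexed element within each 128-bit
  // segment, roughly:
  //   Fmul(zd, zn, zm, i)  ~=  Fmul(zd, zn, <element i of zm's own segment>)
  // Since z25 holds the same 128-bit pattern in every segment, `Dup` of the
  // indexed element followed by `FPSegmentPatternHelper` rebuilds that
  // per-segment operand, so a plain element-wise `Fmul` gives the same result.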
__ Dup(z12.VnH(), z25.VnH(), 0); FPSegmentPatternHelper(&masm, z12.VnH(), p0.Merging(), z12.VnH()); __ Fmul(z12.VnH(), z1.VnH(), z12.VnH()); __ Dup(z13.VnH(), z25.VnH(), 1); FPSegmentPatternHelper(&masm, z13.VnH(), p0.Merging(), z13.VnH()); __ Fmul(z13.VnH(), z1.VnH(), z13.VnH()); __ Dup(z14.VnH(), z25.VnH(), 4); FPSegmentPatternHelper(&masm, z14.VnH(), p0.Merging(), z14.VnH()); __ Fmul(z14.VnH(), z1.VnH(), z14.VnH()); __ Dup(z15.VnH(), z25.VnH(), 7); FPSegmentPatternHelper(&masm, z15.VnH(), p0.Merging(), z15.VnH()); __ Fmul(z15.VnH(), z1.VnH(), z15.VnH()); __ Dup(z16.VnS(), z25.VnS(), 0); FPSegmentPatternHelper(&masm, z16.VnH(), p0.Merging(), z16.VnH()); __ Fmul(z16.VnS(), z1.VnS(), z16.VnS()); __ Dup(z17.VnS(), z25.VnS(), 1); FPSegmentPatternHelper(&masm, z17.VnH(), p0.Merging(), z17.VnH()); __ Fmul(z17.VnS(), z1.VnS(), z17.VnS()); __ Dup(z18.VnS(), z25.VnS(), 2); FPSegmentPatternHelper(&masm, z18.VnH(), p0.Merging(), z18.VnH()); __ Fmul(z18.VnS(), z1.VnS(), z18.VnS()); __ Dup(z19.VnS(), z25.VnS(), 3); FPSegmentPatternHelper(&masm, z19.VnH(), p0.Merging(), z19.VnH()); __ Fmul(z19.VnS(), z1.VnS(), z19.VnS()); __ Dup(z20.VnD(), z25.VnD(), 0); FPSegmentPatternHelper(&masm, z20.VnH(), p0.Merging(), z20.VnH()); __ Fmul(z20.VnD(), z1.VnD(), z20.VnD()); __ Dup(z21.VnD(), z25.VnD(), 1); FPSegmentPatternHelper(&masm, z21.VnH(), p0.Merging(), z21.VnH()); __ Fmul(z21.VnD(), z1.VnD(), z21.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z12.VnH(), z2.VnH()); ASSERT_EQUAL_SVE(z13.VnH(), z3.VnH()); ASSERT_EQUAL_SVE(z14.VnH(), z4.VnH()); ASSERT_EQUAL_SVE(z15.VnH(), z5.VnH()); ASSERT_EQUAL_SVE(z16.VnS(), z6.VnS()); ASSERT_EQUAL_SVE(z17.VnS(), z7.VnS()); ASSERT_EQUAL_SVE(z18.VnS(), z8.VnS()); ASSERT_EQUAL_SVE(z19.VnS(), z9.VnS()); ASSERT_EQUAL_SVE(z20.VnD(), z10.VnD()); ASSERT_EQUAL_SVE(z21.VnD(), z11.VnD()); } } TEST_SVE(sve_ftmad) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t in_h0[] = {0x7c027e01fc02fe01, 0x3c003c00bc00bc00, 0x3c003c00bc00bc00}; uint64_t in_h1[] = {0xfe01fc027e017e01, 0x3c00bc003c00bc00, 0x3c00bc003c00bc00}; uint64_t in_s0[] = {0x7f800002ffc00001, 0x3f8000003f800000, 0xbf800000bf800000}; uint64_t in_s1[] = {0xffc00001ffc00001, 0x3f800000bf800000, 0x3f800000bf800000}; uint64_t in_d0[] = {0x7ff8000000000001, 0x3ff0000000000000, 0xbff0000000000000}; uint64_t in_d1[] = {0xfff0000000000002, 0xbff0000000000000, 0x3ff0000000000000}; InsrHelper(&masm, z0.VnD(), in_h0); InsrHelper(&masm, z1.VnD(), in_h1); InsrHelper(&masm, z2.VnD(), in_s0); InsrHelper(&masm, z3.VnD(), in_s1); InsrHelper(&masm, z4.VnD(), in_d0); InsrHelper(&masm, z5.VnD(), in_d1); __ Mov(z6, z0); __ Ftmad(z6.VnH(), z6.VnH(), z1.VnH(), 0); __ Mov(z7, z0); __ Ftmad(z7.VnH(), z7.VnH(), z1.VnH(), 1); __ Mov(z8, z0); __ Ftmad(z8.VnH(), z8.VnH(), z1.VnH(), 2); __ Mov(z9, z2); __ Ftmad(z9.VnS(), z9.VnS(), z3.VnS(), 0); __ Mov(z10, z2); __ Ftmad(z10.VnS(), z10.VnS(), z3.VnS(), 3); __ Mov(z11, z2); __ Ftmad(z11.VnS(), z11.VnS(), z3.VnS(), 4); __ Mov(z12, z4); __ Ftmad(z12.VnD(), z12.VnD(), z5.VnD(), 0); __ Mov(z13, z4); __ Ftmad(z13.VnD(), z13.VnD(), z5.VnD(), 5); __ Mov(z14, z4); __ Ftmad(z14.VnD(), z14.VnD(), z5.VnD(), 7); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z6[] = {0x7e027e02fe02fe01, 0x4000400000000000, 0x4000400000000000}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0x7e027e02fe02fe01, 0x3aab3800bcabbe00, 0x3aab3800bcabbe00}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0x7e027e02fe02fe01, 0x3c083c2abbefbbac, 0x3c083c2abbefbbac}; 
ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = {0x7fc00002ffc00001, 0x4000000040000000, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z10[] = {0x7fc00002ffc00001, 0x3f7ff2ff3f7fa4fc, 0xbf800680bf802d82}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); uint64_t expected_z11[] = {0x7fc00002ffc00001, 0x3f8000173f8000cd, 0xbf7fffd2bf7ffe66}; ASSERT_EQUAL_SVE(expected_z11, z11.VnD()); uint64_t expected_z12[] = {0x7ff8000000000002, 0x4000000000000000, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z12, z12.VnD()); uint64_t expected_z13[] = {0x7ff8000000000002, 0x3fefffff6c0d846c, 0xbff0000006b978ae}; ASSERT_EQUAL_SVE(expected_z13, z13.VnD()); uint64_t expected_z14[] = {0x7ff8000000000002, 0x3feffffffffe708a, 0xbff0000000000000}; ASSERT_EQUAL_SVE(expected_z14, z14.VnD()); } } static void BasicFPArithHelper(MacroAssembler* masm, int lane_size_in_bits, const uint64_t (&inputs)[2], const uint64_t (&inputs_fmulx)[2], const uint64_t (&inputs_nans)[2]) { int ls = lane_size_in_bits; for (int i = 0; i < 16; i++) { InsrHelper(masm, z0.VnD(), inputs); } ZRegister rvrs = z1.WithLaneSize(ls); masm->Rev(rvrs, z0.WithLaneSize(ls)); int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1}; Initialise(masm, p2.VnB(), pred); PRegisterM p2m = p2.Merging(); masm->Mov(z2, z0); masm->Fadd(z2.WithLaneSize(ls), p2m, z2.WithLaneSize(ls), rvrs, FastNaNPropagation); masm->Mov(z3, z0); masm->Fsub(z3.WithLaneSize(ls), p2m, z3.WithLaneSize(ls), rvrs); masm->Mov(z4, z0); masm->Fsub(z4.WithLaneSize(ls), p2m, rvrs, z4.WithLaneSize(ls)); masm->Mov(z5, z0); masm->Fabd(z5.WithLaneSize(ls), p2m, z5.WithLaneSize(ls), rvrs, FastNaNPropagation); masm->Mov(z6, z0); masm->Fmul(z6.WithLaneSize(ls), p2m, z6.WithLaneSize(ls), rvrs, FastNaNPropagation); for (int i = 0; i < 16; i++) { InsrHelper(masm, z7.VnD(), inputs_fmulx); } masm->Rev(z8.WithLaneSize(ls), z7.WithLaneSize(ls)); masm->Fmulx(z7.WithLaneSize(ls), p2m, z7.WithLaneSize(ls), z8.WithLaneSize(ls), FastNaNPropagation); InsrHelper(masm, z8.VnD(), inputs_nans); masm->Mov(z9, z8); masm->Fminnm(z9.WithLaneSize(ls), p2m, z9.WithLaneSize(ls), rvrs, FastNaNPropagation); masm->Mov(z10, z8); masm->Fmaxnm(z10.WithLaneSize(ls), p2m, z10.WithLaneSize(ls), rvrs, FastNaNPropagation); } TEST_SVE(sve_fp_arith_pred_h) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t inputs[] = {0x4800470046004500, 0x4400420040003c00}; uint64_t inputs_fmulx[] = {0x7c00fc007c00fc00, 0x0000800000008000}; uint64_t inputs_nans[] = {0x7fffffff7fffffff, 0x7bfffbff7fbbfbff}; BasicFPArithHelper(&masm, kHRegSize, inputs, inputs_fmulx, inputs_nans); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z2[] = {0x4880488048804880, 0x4880420048804880}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0x4700450042003c00, 0xbc004200c500c700}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0xc700c500c200bc00, 0x3c00420045004700}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x4700450042003c00, 0x3c00420045004700}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x48004b004c804d00, 0x4d0042004b004800}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0xc000c000c000c000, 0xc0008000c000c000}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z9[] = {0x3c00400042004400, 0x4500fbff4700fbff}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z10[] = {0x3c00400042004400, 0x7bfffbff47004800}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); } } 
TEST_SVE(sve_fp_arith_pred_s) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t inputs[] = {0x4080000040400000, 0x400000003f800000}; uint64_t inputs_fmulx[] = {0x7f800000ff800000, 0x0000000080000000}; uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x41000000c1000000}; BasicFPArithHelper(&masm, kSRegSize, inputs, inputs_fmulx, inputs_nans); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z2[] = {0x40a0000040a00000, 0x4000000040a00000}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0x404000003f800000, 0x40000000c0400000}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0xc0400000bf800000, 0x4000000040400000}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x404000003f800000, 0x4000000040400000}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x4080000040c00000, 0x4000000040800000}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0xc0000000c0000000, 0x00000000c0000000}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z9[] = {0x3f80000040000000, 0x41000000c1000000}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z10[] = {0x3f80000040000000, 0x4100000040800000}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); } } TEST_SVE(sve_fp_arith_pred_d) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t inputs[] = {0x4000000000000000, 0x3ff0000000000000}; uint64_t inputs_fmulx[] = {0x7ff0000000000000, 0x8000000000000000}; uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x4100000000000000}; BasicFPArithHelper(&masm, kDRegSize, inputs, inputs_fmulx, inputs_nans); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z2[] = {0x4008000000000000, 0x4008000000000000}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0x3ff0000000000000, 0xbff0000000000000}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0xbff0000000000000, 0x3ff0000000000000}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x3ff0000000000000, 0x3ff0000000000000}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x4000000000000000, 0x4000000000000000}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0xc000000000000000, 0xc000000000000000}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z9[] = {0x3ff0000000000000, 0x4000000000000000}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z10[] = {0x3ff0000000000000, 0x4100000000000000}; ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); } } TEST_SVE(sve_fp_arith_pred_imm) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1}; Initialise(&masm, p0.VnB(), pred); PRegisterM p0m = p0.Merging(); __ Ptrue(p1.VnB()); __ Fdup(z0.VnD(), 0.0); __ Mov(z1, z0); __ Fdiv(z1.VnH(), p1.Merging(), z1.VnH(), z1.VnH()); __ Mov(z2, z0); __ Fadd(z2.VnH(), p0m, z2.VnH(), 0.5); __ Mov(z3, z2); __ Fsub(z3.VnH(), p0m, z3.VnH(), 1.0); __ Mov(z4, z3); __ Fsub(z4.VnH(), p0m, 1.0, z4.VnH()); __ Mov(z5, z4); __ Fmul(z5.VnH(), p0m, z5.VnH(), 2.0); __ Mov(z6, z1); __ Fminnm(z6.VnH(), p0m, z6.VnH(), 0.0); __ Mov(z7, z1); __ Fmaxnm(z7.VnH(), p0m, z7.VnH(), 1.0); __ Mov(z8, z5); __ Fmin(z8.VnH(), p0m, z8.VnH(), 1.0); __ Mov(z9, z5); __ Fmax(z9.VnH(), p0m, z9.VnH(), 0.0); __ Mov(z11, z0); __ Fdiv(z11.VnS(), p1.Merging(), z11.VnS(), z11.VnS()); __ Mov(z12, z0); __ Fadd(z12.VnS(), p0m, z12.VnS(), 0.5); __ Mov(z13, z12); __ Fsub(z13.VnS(), p0m, z13.VnS(), 1.0); __ Mov(z14, z13); __ Fsub(z14.VnS(), p0m, 1.0, z14.VnS()); __ Mov(z15, z14); 
__ Fmul(z15.VnS(), p0m, z15.VnS(), 2.0); __ Mov(z16, z11); __ Fminnm(z16.VnS(), p0m, z16.VnS(), 0.0); __ Mov(z17, z11); __ Fmaxnm(z17.VnS(), p0m, z17.VnS(), 1.0); __ Mov(z18, z15); __ Fmin(z18.VnS(), p0m, z18.VnS(), 1.0); __ Mov(z19, z15); __ Fmax(z19.VnS(), p0m, z19.VnS(), 0.0); __ Mov(z21, z0); __ Fdiv(z21.VnD(), p1.Merging(), z21.VnD(), z21.VnD()); __ Mov(z22, z0); __ Fadd(z22.VnD(), p0m, z22.VnD(), 0.5); __ Mov(z23, z22); __ Fsub(z23.VnD(), p0m, z23.VnD(), 1.0); __ Mov(z24, z23); __ Fsub(z24.VnD(), p0m, 1.0, z24.VnD()); __ Mov(z25, z24); __ Fmul(z25.VnD(), p0m, z25.VnD(), 2.0); __ Mov(z26, z21); __ Fminnm(z26.VnD(), p0m, z26.VnD(), 0.0); __ Mov(z27, z21); __ Fmaxnm(z27.VnD(), p0m, z27.VnD(), 1.0); __ Mov(z28, z25); __ Fmin(z28.VnD(), p0m, z28.VnD(), 1.0); __ Mov(z29, z25); __ Fmax(z29.VnD(), p0m, z29.VnD(), 0.0); __ Index(z0.VnH(), -3, 1); __ Scvtf(z0.VnH(), p1.Merging(), z0.VnH()); __ Fmax(z0.VnH(), p1.Merging(), z0.VnH(), 0.0); __ Index(z1.VnS(), -4, 2); __ Scvtf(z1.VnS(), p1.Merging(), z1.VnS()); __ Fadd(z1.VnS(), p1.Merging(), z1.VnS(), 1.0); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z2[] = {0x3800380038003800, 0x3800000038003800}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0xb800b800b800b800, 0xb8000000b800b800}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); uint64_t expected_z4[] = {0x3e003e003e003e00, 0x3e0000003e003e00}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x4200420042004200, 0x4200000042004200}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x0000000000000000, 0x00007e0000000000}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0x3c003c003c003c00, 0x3c007e003c003c00}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0x3c003c003c003c00, 0x3c0000003c003c00}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = {0x4200420042004200, 0x4200000042004200}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z12[] = {0x3f0000003f000000, 0x000000003f000000}; ASSERT_EQUAL_SVE(expected_z12, z12.VnD()); uint64_t expected_z13[] = {0xbf000000bf000000, 0x00000000bf000000}; ASSERT_EQUAL_SVE(expected_z13, z13.VnD()); uint64_t expected_z14[] = {0x3fc000003fc00000, 0x000000003fc00000}; ASSERT_EQUAL_SVE(expected_z14, z14.VnD()); uint64_t expected_z15[] = {0x4040000040400000, 0x0000000040400000}; ASSERT_EQUAL_SVE(expected_z15, z15.VnD()); uint64_t expected_z16[] = {0x0000000000000000, 0x7fc0000000000000}; ASSERT_EQUAL_SVE(expected_z16, z16.VnD()); uint64_t expected_z17[] = {0x3f8000003f800000, 0x7fc000003f800000}; ASSERT_EQUAL_SVE(expected_z17, z17.VnD()); uint64_t expected_z18[] = {0x3f8000003f800000, 0x000000003f800000}; ASSERT_EQUAL_SVE(expected_z18, z18.VnD()); uint64_t expected_z19[] = {0x4040000040400000, 0x0000000040400000}; ASSERT_EQUAL_SVE(expected_z19, z19.VnD()); uint64_t expected_z22[] = {0x3fe0000000000000, 0x3fe0000000000000}; ASSERT_EQUAL_SVE(expected_z22, z22.VnD()); uint64_t expected_z23[] = {0xbfe0000000000000, 0xbfe0000000000000}; ASSERT_EQUAL_SVE(expected_z23, z23.VnD()); uint64_t expected_z24[] = {0x3ff8000000000000, 0x3ff8000000000000}; ASSERT_EQUAL_SVE(expected_z24, z24.VnD()); uint64_t expected_z25[] = {0x4008000000000000, 0x4008000000000000}; ASSERT_EQUAL_SVE(expected_z25, z25.VnD()); uint64_t expected_z26[] = {0x0000000000000000, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z26, z26.VnD()); uint64_t expected_z27[] = {0x3ff0000000000000, 0x3ff0000000000000}; ASSERT_EQUAL_SVE(expected_z27, z27.VnD()); uint64_t expected_z28[] = 
{0x3ff0000000000000, 0x3ff0000000000000}; ASSERT_EQUAL_SVE(expected_z28, z28.VnD()); uint64_t expected_z29[] = {0x4008000000000000, 0x4008000000000000}; ASSERT_EQUAL_SVE(expected_z29, z29.VnD()); uint64_t expected_z0[] = {0x4400420040003c00, 0x0000000000000000}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); uint64_t expected_z1[] = {0x404000003f800000, 0xbf800000c0400000}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); } } TEST_SVE(sve_fscale) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); uint64_t inputs_h[] = {0x4800470046004500, 0x4400420040003c00}; InsrHelper(&masm, z0.VnD(), inputs_h); uint64_t inputs_s[] = {0x4080000040400000, 0x400000003f800000}; InsrHelper(&masm, z1.VnD(), inputs_s); uint64_t inputs_d[] = {0x40f0000000000000, 0x4000000000000000}; InsrHelper(&masm, z2.VnD(), inputs_d); uint64_t scales[] = {0x00080002fff8fffe, 0x00100001fff0ffff}; InsrHelper(&masm, z3.VnD(), scales); __ Ptrue(p0.VnB()); int pred[] = {0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1}; Initialise(&masm, p1.VnB(), pred); __ Mov(z4, z0); __ Fscale(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH()); __ Mov(z5, z0); __ Fscale(z5.VnH(), p1.Merging(), z5.VnH(), z3.VnH()); __ Sunpklo(z3.VnS(), z3.VnH()); __ Mov(z6, z1); __ Fscale(z6.VnS(), p0.Merging(), z6.VnS(), z3.VnS()); __ Mov(z7, z1); __ Fscale(z7.VnS(), p1.Merging(), z7.VnS(), z3.VnS()); __ Sunpklo(z3.VnD(), z3.VnS()); __ Mov(z8, z2); __ Fscale(z8.VnD(), p0.Merging(), z8.VnD(), z3.VnD()); __ Mov(z9, z2); __ Fscale(z9.VnD(), p1.Merging(), z9.VnD(), z3.VnD()); // Test full double precision range scaling. __ Dup(z10.VnD(), 2045); __ Dup(z11.VnD(), 0x0010000000000000); // 2^-1022 __ Fscale(z11.VnD(), p0.Merging(), z11.VnD(), z10.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z4[] = {0x68004f0026003d00, 0x7c00460002003800}; ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); uint64_t expected_z5[] = {0x68004f0026004500, 0x7c00420002003800}; ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); uint64_t expected_z6[] = {0x4880000040c00000, 0x380000003f000000}; ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); uint64_t expected_z7[] = {0x4880000040400000, 0x400000003f000000}; ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); uint64_t expected_z8[] = {0x3ff0000000000000, 0x3ff0000000000000}; ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); uint64_t expected_z9[] = {0x40f0000000000000, 0x3ff0000000000000}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); uint64_t expected_z11[] = {0x7fe0000000000000, 0x7fe0000000000000}; ASSERT_EQUAL_SVE(expected_z11, z11.VnD()); } } typedef void (MacroAssembler::*FcvtFrintMFn)(const ZRegister& zd, const PRegisterM& pg, const ZRegister& zn); typedef void (MacroAssembler::*FcvtFrintZFn)(const ZRegister& zd, const PRegisterZ& pg, const ZRegister& zn); template static void TestFcvtFrintHelper(Test* config, FcvtFrintMFn macro_m, FcvtFrintZFn macro_z, int dst_type_size_in_bits, int src_type_size_in_bits, const F (&zn_inputs)[N], const int (&pg_inputs)[N], const uint64_t (&zd_expected_all_active)[N]) { VIXL_ASSERT(macro_m != NULL); SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // If the input and result types have a different size, the instruction // operates on elements of the largest specified type. 
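// For example, a conversion from S-sized floats to D-sized integers places
// its inputs and checks its results using D-sized lanes.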
int lane_size_in_bits = std::max(dst_type_size_in_bits, src_type_size_in_bits); ZRegister zd_all_active = z25; ZRegister zd_merging = z26; ZRegister zn = z27; uint64_t zn_rawbits[N]; FPToRawbitsWithSize(zn_inputs, zn_rawbits, src_type_size_in_bits); InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_rawbits); PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits); __ Ptrue(pg_all_active); // Test floating-point conversions with all lanes activated. (masm.*macro_m)(zd_all_active.WithLaneSize(dst_type_size_in_bits), pg_all_active.Merging(), zn.WithLaneSize(src_type_size_in_bits)); PRegisterWithLaneSize pg_merging = p1.WithLaneSize(lane_size_in_bits); Initialise(&masm, pg_merging, pg_inputs); __ Dup(zd_merging.VnD(), 0x0bad0bad0bad0bad); // Use the same `zn` inputs to test floating-point conversions but partial // lanes are set inactive. (masm.*macro_m)(zd_merging.WithLaneSize(dst_type_size_in_bits), pg_merging.Merging(), zn.WithLaneSize(src_type_size_in_bits)); ZRegister zd_zeroing = z24; PRegisterWithLaneSize pg_zeroing = p1.WithLaneSize(lane_size_in_bits); Initialise(&masm, pg_zeroing, pg_inputs); if (macro_z != NULL) { __ Dup(zd_zeroing.VnD(), 0x0bad0bad0bad0bad); (masm.*macro_z)(zd_zeroing.WithLaneSize(dst_type_size_in_bits), pg_zeroing.Zeroing(), zn.WithLaneSize(src_type_size_in_bits)); } END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zd_expected_all_active, zd_all_active.WithLaneSize(lane_size_in_bits)); uint64_t zd_expected_merging[N]; for (unsigned i = 0; i < N; i++) { zd_expected_merging[i] = pg_inputs[i] ? zd_expected_all_active[i] : 0x0bad0bad0bad0bad & GetUintMask(lane_size_in_bits); } ASSERT_EQUAL_SVE(zd_expected_merging, zd_merging.WithLaneSize(lane_size_in_bits)); if (macro_z != NULL) { uint64_t zd_expected_zeroing[N] = {0}; for (unsigned i = 0; i < N; i++) { if (pg_inputs[i]) { zd_expected_zeroing[i] = zd_expected_all_active[i]; } } ASSERT_EQUAL_SVE(zd_expected_zeroing, zd_zeroing.WithLaneSize(lane_size_in_bits)); } } } template static void TestFcvtzHelper(Test* config, FcvtFrintMFn macro_m, int dst_type_size_in_bits, int src_type_size_in_bits, const F (&zn_inputs)[N], const int (&pg_inputs)[N], const uint64_t (&zd_expected_all_active)[N]) { TestFcvtFrintHelper(config, macro_m, // Fcvt variants have no zeroing predication form. NULL, dst_type_size_in_bits, src_type_size_in_bits, zn_inputs, pg_inputs, zd_expected_all_active); } TEST_SVE(fcvtzs_fcvtzu_float16) { const double h_max_float16 = 0x7ff0; // Largest float16 == INT16_MAX. const double h_min_float16 = -h_max_float16; // Smallest float16 > INT16_MIN. const double largest_float16 = 0xffe0; // 65504 const double smallest_float16 = -largest_float16; const double h_max_int_add_one = 0x8000; double zn_inputs[] = {1.0, 1.1, 1.5, -1.5, h_max_float16, h_min_float16, largest_float16, smallest_float16, kFP64PositiveInfinity, kFP64NegativeInfinity, h_max_int_add_one}; int pg_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1}; uint64_t expected_fcvtzs_fp162h[] = {1, 1, 1, 0xffff, 0x7ff0, 0x8010, 0x7fff, 0x8000, 0x7fff, 0x8000, 0x7fff}; uint64_t expected_fcvtzu_fp162h[] = {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffff, 0, 0x8000}; // Float16 to 16-bit integers. 
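// In the expected values above, out-of-range inputs saturate: fcvtzs clamps
// to INT16_MIN/INT16_MAX (0x8000/0x7fff) and fcvtzu clamps to 0/UINT16_MAX.
// Note that -1.5 truncates towards zero, giving -1 (0xffff) for fcvtzs but
// saturating to 0 for fcvtzu, and that 0x8000 (INT16_MAX + 1) only fits in
// the unsigned result.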
TestFcvtzHelper(config, &MacroAssembler::Fcvtzs, kHRegSize, kHRegSize, zn_inputs, pg_inputs, expected_fcvtzs_fp162h); TestFcvtzHelper(config, &MacroAssembler::Fcvtzu, kHRegSize, kHRegSize, zn_inputs, pg_inputs, expected_fcvtzu_fp162h); uint64_t expected_fcvtzs_fp162w[] = {1, 1, 1, 0xffffffff, 0x7ff0, 0xffff8010, 0xffe0, 0xffff0020, 0x7fffffff, 0x80000000, 0x8000}; uint64_t expected_fcvtzu_fp162w[] = {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffff, 0, 0x8000}; // Float16 to 32-bit integers. TestFcvtzHelper(config, &MacroAssembler::Fcvtzs, kSRegSize, kHRegSize, zn_inputs, pg_inputs, expected_fcvtzs_fp162w); TestFcvtzHelper(config, &MacroAssembler::Fcvtzu, kSRegSize, kHRegSize, zn_inputs, pg_inputs, expected_fcvtzu_fp162w); uint64_t expected_fcvtzs_fp162x[] = {1, 1, 1, 0xffffffffffffffff, 0x7ff0, 0xffffffffffff8010, 0xffe0, 0xffffffffffff0020, 0x7fffffffffffffff, 0x8000000000000000, 0x8000}; uint64_t expected_fcvtzu_fp162x[] = {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffffffffffff, 0, 0x8000}; // Float16 to 64-bit integers. TestFcvtzHelper(config, &MacroAssembler::Fcvtzs, kDRegSize, kHRegSize, zn_inputs, pg_inputs, expected_fcvtzs_fp162x); TestFcvtzHelper(config, &MacroAssembler::Fcvtzu, kDRegSize, kHRegSize, zn_inputs, pg_inputs, expected_fcvtzu_fp162x); } TEST_SVE(fcvtzs_fcvtzu_float) { const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX. const double w_min_float = -w_max_float; // Smallest float > INT32_MIN. const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX. const double x_min_float = -x_max_float; // Smallest float > INT64_MIN. const double w_min_int_add_one = 0x80000000; const double x_max_int_add_one = 0x80000000'00000000; double zn_inputs[] = {1.0, 1.1, 1.5, -1.5, w_max_float, w_min_float, x_max_float, x_min_float, kFP64PositiveInfinity, kFP64NegativeInfinity, w_min_int_add_one, x_max_int_add_one}; int pg_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1}; uint64_t expected_fcvtzs_s2w[] = {1, 1, 1, 0xffffffff, 0x7fffff80, 0x80000080, 0x7fffffff, 0x80000000, 0x7fffffff, 0x80000000, 0x7fffffff, 0x7fffffff}; uint64_t expected_fcvtzu_s2w[] = {1, 1, 1, 0, 0x7fffff80, 0, 0xffffffff, 0, 0xffffffff, 0, 0x80000000, 0xffffffff}; // Float to 32-bit integers. TestFcvtzHelper(config, &MacroAssembler::Fcvtzs, kSRegSize, kSRegSize, zn_inputs, pg_inputs, expected_fcvtzs_s2w); TestFcvtzHelper(config, &MacroAssembler::Fcvtzu, kSRegSize, kSRegSize, zn_inputs, pg_inputs, expected_fcvtzu_s2w); uint64_t expected_fcvtzs_s2x[] = {1, 1, 1, 0xffffffffffffffff, 0x7fffff80, 0xffffffff80000080, 0x7fffff8000000000, 0x8000008000000000, 0x7fffffffffffffff, 0x8000000000000000, 0x80000000, 0x7fffffffffffffff}; uint64_t expected_fcvtzu_s2x[] = {1, 1, 1, 0, 0x7fffff80, 0, 0x7fffff8000000000, 0, 0xffffffffffffffff, 0, 0x80000000, 0x8000000000000000}; // Float to 64-bit integers. TestFcvtzHelper(config, &MacroAssembler::Fcvtzs, kDRegSize, kSRegSize, zn_inputs, pg_inputs, expected_fcvtzs_s2x); TestFcvtzHelper(config, &MacroAssembler::Fcvtzu, kDRegSize, kSRegSize, zn_inputs, pg_inputs, expected_fcvtzu_s2x); } TEST_SVE(fcvtzs_fcvtzu_double) { const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX. const double w_min_float = -w_max_float; // Smallest float > INT32_MIN. const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX. const double x_min_float = -x_max_float; // Smallest float > INT64_MIN. const double w_max_double = kWMaxInt; // Largest double == INT32_MAX. const double w_min_double = -w_max_double; // Smallest double > INT32_MIN. 
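// Unlike INT32_MAX, INT64_MAX is not exactly representable as a double:
// doubles near 2^63 are 1024 apart, so the largest double below INT64_MAX
// is 2^63 - 1024 (0x7ffffffffffffc00), declared next.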
const double x_max_double = 0x7ffffffffffffc00; // Largest double < INT64_MAX. const double x_min_double = -x_max_double; // Smallest double > INT64_MIN. const double w_max_int_sub_one = kWMaxInt - 1; const double w_min_int_add_one = kWMinInt + 1; const double w_max_int_add_one = 0x80000000; const double x_max_int_add_one = 0x80000000'00000000; double zn_inputs[] = {1.0, 1.1, 1.5, -1.5, w_max_float, w_min_float, x_max_float, x_min_float, w_max_double, w_min_double, x_max_double, x_min_double, kFP64PositiveInfinity, kFP64NegativeInfinity, w_max_int_sub_one, w_min_int_add_one, w_max_int_add_one, x_max_int_add_one}; int pg_inputs[] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0}; uint64_t expected_fcvtzs_d2w[] = {1, 1, 1, 0xffffffffffffffff, 0x7fffff80, 0xffffffff80000080, 0x7fffffff, 0xffffffff80000000, 0x7fffffff, 0xffffffff80000001, 0x7fffffff, 0xffffffff80000000, 0x7fffffff, 0xffffffff80000000, 0x7ffffffe, 0xffffffff80000001, 0x7fffffff, 0x7fffffff}; uint64_t expected_fcvtzu_d2w[] = {1, 1, 1, 0, 0x7fffff80, 0, 0xffffffff, 0, 0x7fffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0x7ffffffe, 0, 0x80000000, 0xffffffff}; // Double to 32-bit integers. TestFcvtzHelper(config, &MacroAssembler::Fcvtzs, kSRegSize, kDRegSize, zn_inputs, pg_inputs, expected_fcvtzs_d2w); TestFcvtzHelper(config, &MacroAssembler::Fcvtzu, kSRegSize, kDRegSize, zn_inputs, pg_inputs, expected_fcvtzu_d2w); uint64_t expected_fcvtzs_d2x[] = {1, 1, 1, 0xffffffffffffffff, 0x7fffff80, 0xffffffff80000080, 0x7fffff8000000000, 0x8000008000000000, 0x7fffffff, 0xffffffff80000001, 0x7ffffffffffffc00, 0x8000000000000400, 0x7fffffffffffffff, 0x8000000000000000, 0x7ffffffe, 0xffffffff80000001, 0x80000000, 0x7fffffffffffffff}; uint64_t expected_fcvtzu_d2x[] = {1, 1, 1, 0, 0x7fffff80, 0, 0x7fffff8000000000, 0, 0x7fffffff, 0, 0x7ffffffffffffc00, 0, 0xffffffffffffffff, 0, 0x000000007ffffffe, 0, 0x80000000, 0x8000000000000000}; // Double to 64-bit integers. 
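// Note that 2^63 (x_max_int_add_one) still saturates to INT64_MAX for
// fcvtzs, but it is exactly representable in the unsigned result
// (0x8000000000000000) for fcvtzu.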
TestFcvtzHelper(config, &MacroAssembler::Fcvtzs, kDRegSize, kDRegSize, zn_inputs, pg_inputs, expected_fcvtzs_d2x); TestFcvtzHelper(config, &MacroAssembler::Fcvtzu, kDRegSize, kDRegSize, zn_inputs, pg_inputs, expected_fcvtzu_d2x); } template static void TestFrintHelper(Test* config, FcvtFrintMFn macro_m, FcvtFrintZFn macro_z, int lane_size_in_bits, const F (&zn_inputs)[N], const int (&pg_inputs)[N], const F (&zd_expected)[N]) { uint64_t zd_expected_rawbits[N]; FPToRawbitsWithSize(zd_expected, zd_expected_rawbits, lane_size_in_bits); TestFcvtFrintHelper(config, macro_m, macro_z, lane_size_in_bits, lane_size_in_bits, zn_inputs, pg_inputs, zd_expected_rawbits); } TEST_SVE(frint) { const double inf_pos = kFP64PositiveInfinity; const double inf_neg = kFP64NegativeInfinity; double zn_inputs[] = {1.1, 1.5, 1.9, 2.5, -1.5, -2.5, 0.0, -0.0, -0.2, inf_pos, inf_neg}; double zd_expected_a[] = {1.0, 2.0, 2.0, 3.0, -2.0, -3.0, 0.0, -0.0, -0.0, inf_pos, inf_neg}; double zd_expected_i[] = {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg}; double zd_expected_m[] = {1.0, 1.0, 1.0, 2.0, -2.0, -3.0, 0.0, -0.0, -1.0, inf_pos, inf_neg}; double zd_expected_n[] = {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg}; double zd_expected_p[] = {2.0, 2.0, 2.0, 3.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg}; double zd_expected_x[] = {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg}; double zd_expected_z[] = {1.0, 1.0, 1.0, 2.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg}; int pg_inputs[] = {0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0}; struct TestDataSet { FcvtFrintMFn macro_m; // merging form. FcvtFrintZFn macro_z; // zeroing form. double (&expected)[11]; }; TestDataSet test_data[] = {{&MacroAssembler::Frinta, &MacroAssembler::Frinta, zd_expected_a}, {&MacroAssembler::Frinti, &MacroAssembler::Frinti, zd_expected_i}, {&MacroAssembler::Frintm, &MacroAssembler::Frintm, zd_expected_m}, {&MacroAssembler::Frintn, &MacroAssembler::Frintn, zd_expected_n}, {&MacroAssembler::Frintp, &MacroAssembler::Frintp, zd_expected_p}, {&MacroAssembler::Frintx, &MacroAssembler::Frintx, zd_expected_x}, {&MacroAssembler::Frintz, &MacroAssembler::Frintz, zd_expected_z}}; unsigned lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize}; for (size_t i = 0; i < sizeof(test_data) / sizeof(TestDataSet); i++) { for (size_t j = 0; j < ArrayLength(lane_sizes); j++) { TestFrintHelper(config, test_data[i].macro_m, test_data[i].macro_z, lane_sizes[j], zn_inputs, pg_inputs, test_data[i].expected); } } } struct CvtfTestDataSet { uint64_t int_value; uint64_t scvtf_result; uint64_t ucvtf_result; }; template static void TestUScvtfHelper(Test* config, int dst_type_size_in_bits, int src_type_size_in_bits, const int (&pg_inputs)[N], const CvtfTestDataSet (&data_set)[N]) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); // Unpack the data from the array of struct into individual arrays that can // simplify the testing. uint64_t zn_inputs[N]; uint64_t expected_zd_scvtf_all_active[N]; uint64_t expected_zd_ucvtf_all_active[N]; for (size_t i = 0; i < N; i++) { zn_inputs[i] = data_set[i].int_value; expected_zd_scvtf_all_active[i] = data_set[i].scvtf_result; expected_zd_ucvtf_all_active[i] = data_set[i].ucvtf_result; } // If the input and result types have a different size, the instruction // operates on elements of the largest specified type. 
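// Scvtf interprets the source lanes as signed integers and Ucvtf as
// unsigned, so the scvtf_result and ucvtf_result fields of each data set
// only differ for inputs with the top bit set.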
int lane_size_in_bits = std::max(dst_type_size_in_bits, src_type_size_in_bits); ZRegister zd_scvtf_all_active = z25; ZRegister zd_ucvtf_all_active = z26; ZRegister zn = z27; InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_inputs); PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits); __ Ptrue(pg_all_active); // Test integer conversions with all lanes activated. __ Scvtf(zd_scvtf_all_active.WithLaneSize(dst_type_size_in_bits), pg_all_active.Merging(), zn.WithLaneSize(src_type_size_in_bits)); __ Ucvtf(zd_ucvtf_all_active.WithLaneSize(dst_type_size_in_bits), pg_all_active.Merging(), zn.WithLaneSize(src_type_size_in_bits)); ZRegister zd_scvtf_merged = z23; ZRegister zd_ucvtf_merged = z24; PRegisterWithLaneSize pg_merged = p1.WithLaneSize(lane_size_in_bits); Initialise(&masm, pg_merged, pg_inputs); uint64_t snan; switch (lane_size_in_bits) { case kHRegSize: snan = 0x7c11; break; case kSRegSize: snan = 0x7f951111; break; case kDRegSize: snan = 0x7ff5555511111111; break; } __ Dup(zd_scvtf_merged.WithLaneSize(lane_size_in_bits), snan); __ Dup(zd_ucvtf_merged.WithLaneSize(lane_size_in_bits), snan); // Use the same `zn` inputs to test integer conversions but some lanes are set // inactive. __ Scvtf(zd_scvtf_merged.WithLaneSize(dst_type_size_in_bits), pg_merged.Merging(), zn.WithLaneSize(src_type_size_in_bits)); __ Ucvtf(zd_ucvtf_merged.WithLaneSize(dst_type_size_in_bits), pg_merged.Merging(), zn.WithLaneSize(src_type_size_in_bits)); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(expected_zd_scvtf_all_active, zd_scvtf_all_active.WithLaneSize(lane_size_in_bits)); ASSERT_EQUAL_SVE(expected_zd_ucvtf_all_active, zd_ucvtf_all_active.WithLaneSize(lane_size_in_bits)); uint64_t expected_zd_scvtf_merged[N]; for (size_t i = 0; i < N; i++) { expected_zd_scvtf_merged[i] = pg_inputs[i] ? expected_zd_scvtf_all_active[i] : snan; } ASSERT_EQUAL_SVE(expected_zd_scvtf_merged, zd_scvtf_merged.WithLaneSize(lane_size_in_bits)); uint64_t expected_zd_ucvtf_merged[N]; for (size_t i = 0; i < N; i++) { expected_zd_ucvtf_merged[i] = pg_inputs[i] ? expected_zd_ucvtf_all_active[i] : snan; } ASSERT_EQUAL_SVE(expected_zd_ucvtf_merged, zd_ucvtf_merged.WithLaneSize(lane_size_in_bits)); } } TEST_SVE(scvtf_ucvtf_h_s_d_to_float16) { // clang-format off CvtfTestDataSet data_set_1[] = { // Simple conversions of positive numbers which require no rounding; the // results should not depend on the rounding mode, and ucvtf and scvtf should // produce the same result. {0x0000, 0x0000, 0x0000}, {0x0001, 0x3c00, 0x3c00}, {0x0010, 0x4c00, 0x4c00}, {0x0080, 0x5800, 0x5800}, {0x0400, 0x6400, 0x6400}, // Conversions which require rounding. {0x4000, 0x7400, 0x7400}, {0x4001, 0x7400, 0x7400}, // Round up to produce a result that's too big for the input to represent. {0x7ff0, 0x77ff, 0x77ff}, {0x7ff1, 0x77ff, 0x77ff}, {0x7ffe, 0x7800, 0x7800}, {0x7fff, 0x7800, 0x7800}}; int pg_1[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; TestUScvtfHelper(config, kHRegSize, kDRegSize, pg_1, data_set_1); TestUScvtfHelper(config, kHRegSize, kSRegSize, pg_1, data_set_1); TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_1, data_set_1); CvtfTestDataSet data_set_2[] = { // Test mantissa extremities. {0x0401, 0x6401, 0x6401}, {0x4020, 0x7402, 0x7402}, // The largest int16_t that fits in a float16. {0xffef, 0xcc40, 0x7bff}, // Values that would be negative if treated as an int16_t. {0xff00, 0xdc00, 0x7bf8}, {0x8000, 0xf800, 0x7800}, {0x8100, 0xf7f0, 0x7808}, // Check for bit pattern reproduction. 
{0x0123, 0x5c8c, 0x5c8c}, {0x0cde, 0x6a6f, 0x6a6f}, // Simple conversions of negative int64_t values. These require no rounding, // and the results should not depend on the rounding mode. {0xf800, 0xe800, 0x7bc0}, {0xfc00, 0xe400, 0x7be0}, {0xc000, 0xf400, 0x7a00}, // Check rounding of negative int16_t values. {0x8ffe, 0xf700, 0x7880}, {0x8fff, 0xf700, 0x7880}, {0xffee, 0xcc80, 0x7bff}, {0xffef, 0xcc40, 0x7bff}}; int pg_2[] = {1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1}; // `32-bit to float16` and `64-bit to float16` of above tests has been tested // in `ucvtf` of `16-bit to float16`. TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_2, data_set_2); // clang-format on } TEST_SVE(scvtf_ucvtf_s_to_float) { // clang-format off int dst_lane_size = kSRegSize; int src_lane_size = kSRegSize; // Simple conversions of positive numbers which require no rounding; the // results should not depend on the rounding mode, and ucvtf and scvtf should // produce the same result. CvtfTestDataSet data_set_1[] = { {0x00000000, 0x00000000, 0x00000000}, {0x00000001, 0x3f800000, 0x3f800000}, {0x00004000, 0x46800000, 0x46800000}, {0x00010000, 0x47800000, 0x47800000}, {0x40000000, 0x4e800000, 0x4e800000}}; int pg_1[] = {1, 0, 1, 0, 0}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1); CvtfTestDataSet data_set_2[] = { // Test mantissa extremities. {0x00800001, 0x4b000001, 0x4b000001}, {0x40400000, 0x4e808000, 0x4e808000}, // The largest int32_t that fits in a double. {0x7fffff80, 0x4effffff, 0x4effffff}, // Values that would be negative if treated as an int32_t. {0xffffffff, 0xbf800000, 0x4f800000}, {0xffffff00, 0xc3800000, 0x4f7fffff}, {0x80000000, 0xcf000000, 0x4f000000}, {0x80000001, 0xcf000000, 0x4f000000}, // Check for bit pattern reproduction. {0x089abcde, 0x4d09abce, 0x4d09abce}, {0x12345678, 0x4d91a2b4, 0x4d91a2b4}}; int pg_2[] = {1, 0, 1, 0, 1, 1, 1, 0, 0}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2); // Simple conversions of negative int32_t values. These require no rounding, // and the results should not depend on the rounding mode. CvtfTestDataSet data_set_3[] = { {0xffffc000, 0xc6800000, 0x4f7fffc0}, {0xffff0000, 0xc7800000, 0x4f7fff00}, {0xc0000000, 0xce800000, 0x4f400000}, // Conversions which require rounding. {0x72800000, 0x4ee50000, 0x4ee50000}, {0x72800001, 0x4ee50000, 0x4ee50000}, {0x73000000, 0x4ee60000, 0x4ee60000}, // Check rounding of negative int32_t values. {0x80000140, 0xcefffffe, 0x4f000001}, {0x80000141, 0xcefffffd, 0x4f000001}, {0x80000180, 0xcefffffd, 0x4f000002}, // Round up to produce a result that's too big for the input to represent. {0x7fffffc0, 0x4f000000, 0x4f000000}, {0x7fffffff, 0x4f000000, 0x4f000000}}; int pg_3[] = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3); // clang-format on } TEST_SVE(scvtf_ucvtf_d_to_float) { // clang-format off int dst_lane_size = kSRegSize; int src_lane_size = kDRegSize; // Simple conversions of positive numbers which require no rounding; the // results should not depend on the rounding mode, and ucvtf and scvtf should // produce the same result. 
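// For example, 2^62 (0x4000000000000000) is a power of two, so it converts
// exactly to 0x5e800000 for both scvtf and ucvtf.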
CvtfTestDataSet data_set_1[] = { {0x0000000000000000, 0x00000000, 0x00000000}, {0x0000000000000001, 0x3f800000, 0x3f800000}, {0x0000000040000000, 0x4e800000, 0x4e800000}, {0x0000000100000000, 0x4f800000, 0x4f800000}, {0x4000000000000000, 0x5e800000, 0x5e800000}}; int pg_1[] = {1, 1, 0, 1, 0}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1); CvtfTestDataSet data_set_2[] = { // Test mantissa extremities. {0x0010000000000001, 0x59800000, 0x59800000}, {0x4008000000000000, 0x5e801000, 0x5e801000}, // The largest int32_t that fits in a float. {0x000000007fffff80, 0x4effffff, 0x4effffff}, // Values that would be negative if treated as an int32_t. {0x00000000ffffffff, 0x4f800000, 0x4f800000}, {0x00000000ffffff00, 0x4f7fffff, 0x4f7fffff}, {0x0000000080000000, 0x4f000000, 0x4f000000}, {0x0000000080000100, 0x4f000001, 0x4f000001}, // The largest int64_t that fits in a float. {0x7fffff8000000000, 0x5effffff, 0x5effffff}, // Check for bit pattern reproduction. {0x0123456789abcde0, 0x5b91a2b4, 0x5b91a2b4}, {0x0000000000876543, 0x4b076543, 0x4b076543}}; int pg_2[] = {1, 0, 0, 0, 1, 0, 0, 0, 0, 1}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2); CvtfTestDataSet data_set_3[] = { // Simple conversions of negative int64_t values. These require no rounding, // and the results should not depend on the rounding mode. {0xffffffffc0000000, 0xce800000, 0x5f800000}, {0xffffffff00000000, 0xcf800000, 0x5f800000}, {0xc000000000000000, 0xde800000, 0x5f400000}, // Conversions which require rounding. {0x0000800002800000, 0x57000002, 0x57000002}, {0x0000800002800001, 0x57000003, 0x57000003}, {0x0000800003000000, 0x57000003, 0x57000003}, // Check rounding of negative int64_t values. {0x8000014000000000, 0xdefffffe, 0x5f000001}, {0x8000014000000001, 0xdefffffd, 0x5f000001}, {0x8000018000000000, 0xdefffffd, 0x5f000002}, // Round up to produce a result that's too big for the input to represent. {0x00000000ffffff80, 0x4f800000, 0x4f800000}, {0x00000000ffffffff, 0x4f800000, 0x4f800000}, {0xffffff8000000000, 0xd3000000, 0x5f800000}, {0xffffffffffffffff, 0xbf800000, 0x5f800000}}; int pg_3[] = {0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3); // clang-format on } TEST_SVE(scvtf_ucvtf_d_to_double) { // clang-format off int dst_lane_size = kDRegSize; int src_lane_size = kDRegSize; // Simple conversions of positive numbers which require no rounding; the // results should not depend on the rounding mode, and ucvtf and scvtf should // produce the same result. CvtfTestDataSet data_set_1[] = { {0x0000000000000000, 0x0000000000000000, 0x0000000000000000}, {0x0000000000000001, 0x3ff0000000000000, 0x3ff0000000000000}, {0x0000000040000000, 0x41d0000000000000, 0x41d0000000000000}, {0x0000000100000000, 0x41f0000000000000, 0x41f0000000000000}, {0x4000000000000000, 0x43d0000000000000, 0x43d0000000000000}}; int pg_1[] = {0, 1, 1, 0, 0}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1); CvtfTestDataSet data_set_2[] = { // Test mantissa extremities. {0x0010000000000001, 0x4330000000000001, 0x4330000000000001}, {0x4008000000000000, 0x43d0020000000000, 0x43d0020000000000}, // The largest int32_t that fits in a double. {0x000000007fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000}, // Values that would be negative if treated as an int32_t. 
{0x00000000ffffffff, 0x41efffffffe00000, 0x41efffffffe00000}, {0x0000000080000000, 0x41e0000000000000, 0x41e0000000000000}, {0x0000000080000001, 0x41e0000000200000, 0x41e0000000200000}, // The largest int64_t that fits in a double. {0x7ffffffffffffc00, 0x43dfffffffffffff, 0x43dfffffffffffff}, // Check for bit pattern reproduction. {0x0123456789abcde0, 0x43723456789abcde, 0x43723456789abcde}, {0x0000000012345678, 0x41b2345678000000, 0x41b2345678000000}}; int pg_2[] = {1, 1, 1, 1, 1, 0, 0, 0, 0}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2); CvtfTestDataSet data_set_3[] = { // Simple conversions of negative int64_t values. These require no rounding, // and the results should not depend on the rounding mode. {0xffffffffc0000000, 0xc1d0000000000000, 0x43effffffff80000}, {0xffffffff00000000, 0xc1f0000000000000, 0x43efffffffe00000}, {0xc000000000000000, 0xc3d0000000000000, 0x43e8000000000000}, // Conversions which require rounding. {0x1000000000000280, 0x43b0000000000002, 0x43b0000000000002}, {0x1000000000000281, 0x43b0000000000003, 0x43b0000000000003}, {0x1000000000000300, 0x43b0000000000003, 0x43b0000000000003}, // Check rounding of negative int64_t values. {0x8000000000000a00, 0xc3dffffffffffffe, 0x43e0000000000001}, {0x8000000000000a01, 0xc3dffffffffffffd, 0x43e0000000000001}, {0x8000000000000c00, 0xc3dffffffffffffd, 0x43e0000000000002}, // Round up to produce a result that's too big for the input to represent. {0x7ffffffffffffe00, 0x43e0000000000000, 0x43e0000000000000}, {0x7fffffffffffffff, 0x43e0000000000000, 0x43e0000000000000}, {0xfffffffffffffc00, 0xc090000000000000, 0x43f0000000000000}, {0xffffffffffffffff, 0xbff0000000000000, 0x43f0000000000000}}; int pg_3[] = {1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3); // clang-format on } TEST_SVE(scvtf_ucvtf_s_to_double) { // clang-format off int dst_lane_size = kDRegSize; int src_lane_size = kSRegSize; // Simple conversions of positive numbers which require no rounding; the // results should not depend on the rounding mode, and ucvtf and scvtf should // produce the same result. CvtfTestDataSet data_set_1[] = { {0x00000000, 0x0000000000000000, 0x0000000000000000}, {0x00000001, 0x3ff0000000000000, 0x3ff0000000000000}, {0x00004000, 0x40d0000000000000, 0x40d0000000000000}, {0x00010000, 0x40f0000000000000, 0x40f0000000000000}, {0x40000000, 0x41d0000000000000, 0x41d0000000000000}}; int pg_1[] = {1, 0, 0, 0, 1}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1); CvtfTestDataSet data_set_2[] = { // Test mantissa extremities. {0x40000400, 0x41d0000100000000, 0x41d0000100000000}, // The largest int32_t that fits in a double. {0x7fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000}, // Values that would be negative if treated as an int32_t. {0xffffffff, 0xbff0000000000000, 0x41efffffffe00000}, {0x80000000, 0xc1e0000000000000, 0x41e0000000000000}, {0x80000001, 0xc1dfffffffc00000, 0x41e0000000200000}, // Check for bit pattern reproduction. {0x089abcde, 0x41a13579bc000000, 0x41a13579bc000000}, {0x12345678, 0x41b2345678000000, 0x41b2345678000000}, // Simple conversions of negative int32_t values. These require no rounding, // and the results should not depend on the rounding mode. 
{0xffffc000, 0xc0d0000000000000, 0x41effff800000000}, {0xffff0000, 0xc0f0000000000000, 0x41efffe000000000}, {0xc0000000, 0xc1d0000000000000, 0x41e8000000000000}}; int pg_2[] = {1, 0, 1, 0, 0, 1, 1, 0, 1, 1}; TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2); // Note that IEEE 754 double-precision format has 52-bits fraction, so all // 32-bits integers are representable in double. // clang-format on } TEST_SVE(sve_fadda) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kFP); START(); __ Ptrue(p0.VnB()); __ Pfalse(p1.VnB()); __ Zip1(p1.VnH(), p0.VnH(), p1.VnH()); __ Index(z0.VnS(), 3, 3); __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS()); __ Fmov(s2, 2.0); __ Fadda(s2, p0, s2, z0.VnS()); __ Index(z0.VnD(), -7, -7); __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD()); __ Fmov(d3, 3.0); __ Fadda(d3, p0, d3, z0.VnD()); __ Index(z0.VnH(), 1, 1); __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH()); __ Fmov(h4, 0); __ Fadda(h4, p1, h4, z0.VnH()); END(); if (CAN_RUN()) { RUN(); // Sum of 1 .. n is n+1 * n/2, ie. n(n+1)/2. int n = core.GetSVELaneCount(kSRegSize); ASSERT_EQUAL_FP32(2 + 3 * ((n + 1) * (n / 2)), s2); n /= 2; // Half as many lanes. ASSERT_EQUAL_FP64(3 + -7 * ((n + 1) * (n / 2)), d3); // Sum of first n odd numbers is n^2. n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers. ASSERT_EQUAL_FP16(Float16(n * n), h4); } } TEST_SVE(sve_extract) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Index(z0.VnB(), 0, 1); __ Mov(z1, z0); __ Mov(z2, z0); __ Mov(z3, z0); __ Mov(z4, z0); __ Mov(z5, z0); __ Mov(z6, z0); __ Ext(z1, z1, z0, 0); __ Ext(z2, z2, z0, 1); __ Ext(z3, z3, z0, 15); __ Ext(z4, z4, z0, 31); __ Ext(z5, z5, z0, 47); __ Ext(z6, z6, z0, 255); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z1, z0); int lane_count = core.GetSVELaneCount(kBRegSize); if (lane_count == 16) { uint64_t z2_expected[] = {0x000f0e0d0c0b0a09, 0x0807060504030201}; ASSERT_EQUAL_SVE(z2_expected, z2.VnD()); } else { uint64_t z2_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201}; ASSERT_EQUAL_SVE(z2_expected, z2.VnD()); } if (lane_count == 16) { uint64_t z3_expected[] = {0x0e0d0c0b0a090807, 0x060504030201000f}; ASSERT_EQUAL_SVE(z3_expected, z3.VnD()); } else { uint64_t z3_expected[] = {0x1e1d1c1b1a191817, 0x161514131211100f}; ASSERT_EQUAL_SVE(z3_expected, z3.VnD()); } if (lane_count < 32) { ASSERT_EQUAL_SVE(z4, z0); } else if (lane_count == 32) { uint64_t z4_expected[] = {0x0e0d0c0b0a090807, 0x060504030201001f}; ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); } else { uint64_t z4_expected[] = {0x2e2d2c2b2a292827, 0x262524232221201f}; ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); } if (lane_count < 48) { ASSERT_EQUAL_SVE(z5, z0); } else if (lane_count == 48) { uint64_t z5_expected[] = {0x0e0d0c0b0a090807, 0x060504030201002f}; ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); } else { uint64_t z5_expected[] = {0x3e3d3c3b3a393837, 0x363534333231302f}; ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); } if (lane_count < 256) { ASSERT_EQUAL_SVE(z6, z0); } else { uint64_t z6_expected[] = {0x0e0d0c0b0a090807, 0x06050403020100ff}; ASSERT_EQUAL_SVE(z6_expected, z6.VnD()); } } } TEST_SVE(sve_fp_paired_across) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); __ Pfalse(p1.VnB()); __ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); __ Zip1(p3.VnD(), p0.VnD(), p1.VnD()); __ Zip1(p4.VnH(), p0.VnH(), p1.VnH()); __ Index(z0.VnS(), 3, 3); __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS()); __ Faddv(s1, p0, z0.VnS()); __ Fminv(s2, p2, z0.VnS()); __ Fmaxv(s3, p2, z0.VnS()); __ Index(z0.VnD(), -7, -7); __ Scvtf(z0.VnD(), 
p0.Merging(), z0.VnD()); __ Faddv(d4, p0, z0.VnD()); __ Fminv(d5, p3, z0.VnD()); __ Fmaxv(d6, p3, z0.VnD()); __ Index(z0.VnH(), 1, 1); __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH()); __ Faddv(h7, p4, z0.VnH()); __ Fminv(h8, p4, z0.VnH()); __ Fmaxv(h9, p4, z0.VnH()); __ Dup(z10.VnH(), 0); __ Fdiv(z10.VnH(), p0.Merging(), z10.VnH(), z10.VnH()); __ Insr(z10.VnH(), 0x5140); __ Insr(z10.VnH(), 0xd140); __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 2); __ Fmaxnmv(h11, p0, z10.VnH()); __ Fmaxnmv(h12, p4, z10.VnH()); __ Fminnmv(h13, p0, z10.VnH()); __ Fminnmv(h14, p4, z10.VnH()); __ Dup(z10.VnS(), 0); __ Fdiv(z10.VnS(), p0.Merging(), z10.VnS(), z10.VnS()); __ Insr(z10.VnS(), 0x42280000); __ Insr(z10.VnS(), 0xc2280000); __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 4); __ Fmaxnmv(s15, p0, z10.VnS()); __ Fmaxnmv(s16, p2, z10.VnS()); __ Fminnmv(s17, p0, z10.VnS()); __ Fminnmv(s18, p2, z10.VnS()); __ Dup(z10.VnD(), 0); __ Fdiv(z10.VnD(), p0.Merging(), z10.VnD(), z10.VnD()); __ Insr(z10.VnD(), 0x4045000000000000); __ Insr(z10.VnD(), 0xc045000000000000); __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 8); __ Fmaxnmv(d19, p0, z10.VnD()); __ Fmaxnmv(d20, p3, z10.VnD()); __ Fminnmv(d21, p0, z10.VnD()); __ Fminnmv(d22, p3, z10.VnD()); END(); if (CAN_RUN()) { RUN(); // Sum of 1 .. n is n+1 * n/2, ie. n(n+1)/2. int n = core.GetSVELaneCount(kSRegSize); ASSERT_EQUAL_FP32(3 * ((n + 1) * (n / 2)), s1); ASSERT_EQUAL_FP32(3, s2); ASSERT_EQUAL_FP32(3 * n - 3, s3); n /= 2; // Half as many lanes. ASSERT_EQUAL_FP64(-7 * ((n + 1) * (n / 2)), d4); ASSERT_EQUAL_FP64(-7 * (n - 1), d5); ASSERT_EQUAL_FP64(-7, d6); // Sum of first n odd numbers is n^2. n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers. ASSERT_EQUAL_FP16(Float16(n * n), h7); ASSERT_EQUAL_FP16(Float16(1), h8); n = core.GetSVELaneCount(kHRegSize); ASSERT_EQUAL_FP16(Float16(n - 1), h9); ASSERT_EQUAL_FP16(Float16(42), h11); ASSERT_EQUAL_FP16(Float16(42), h12); ASSERT_EQUAL_FP16(Float16(-42), h13); ASSERT_EQUAL_FP16(Float16(42), h14); ASSERT_EQUAL_FP32(42, s15); ASSERT_EQUAL_FP32(42, s16); ASSERT_EQUAL_FP32(-42, s17); ASSERT_EQUAL_FP32(42, s18); ASSERT_EQUAL_FP64(42, d19); ASSERT_EQUAL_FP64(42, d20); ASSERT_EQUAL_FP64(-42, d21); ASSERT_EQUAL_FP64(42, d22); } } TEST_SVE(sve_frecpe_frsqrte) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); __ Index(z0.VnH(), 0, 1); __ Fdup(z1.VnH(), Float16(1)); __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH()); __ Insr(z1.VnH(), 0); __ Frsqrte(z2.VnH(), z1.VnH()); __ Frecpe(z1.VnH(), z1.VnH()); __ Index(z0.VnS(), 0, 1); __ Fdup(z3.VnS(), Float16(1)); __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS()); __ Insr(z3.VnS(), 0); __ Frsqrte(z4.VnS(), z3.VnS()); __ Frecpe(z3.VnS(), z3.VnS()); __ Index(z0.VnD(), 0, 1); __ Fdup(z5.VnD(), Float16(1)); __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD()); __ Insr(z5.VnD(), 0); __ Frsqrte(z6.VnD(), z5.VnD()); __ Frecpe(z5.VnD(), z5.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t z1_expected[] = {0x23fc27fc2bfc2ffc, 0x33fc37fc3bfc7c00}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); uint64_t z2_expected[] = {0x2ffc31a433fc35a4, 0x37fc39a43bfc7c00}; ASSERT_EQUAL_SVE(z2_expected, z2.VnD()); uint64_t z3_expected[] = {0x3e7f80003eff8000, 0x3f7f80007f800000}; ASSERT_EQUAL_SVE(z3_expected, z3.VnD()); uint64_t z4_expected[] = {0x3eff80003f348000, 0x3f7f80007f800000}; ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); uint64_t z5_expected[] = {0x3feff00000000000, 0x7ff0000000000000}; ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); uint64_t z6_expected[] = {0x3feff00000000000, 0x7ff0000000000000}; 
ASSERT_EQUAL_SVE(z6_expected, z6.VnD()); } } TEST_SVE(sve_frecps_frsqrts) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); __ Index(z0.VnH(), 0, -1); __ Fdup(z1.VnH(), Float16(1)); __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH()); __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH()); __ Insr(z1.VnH(), 0); __ Frsqrts(z2.VnH(), z1.VnH(), z0.VnH()); __ Frecps(z1.VnH(), z1.VnH(), z0.VnH()); __ Index(z0.VnS(), 0, -1); __ Fdup(z3.VnS(), Float16(1)); __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS()); __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS()); __ Insr(z3.VnS(), 0); __ Frsqrts(z4.VnS(), z3.VnS(), z0.VnS()); __ Frecps(z3.VnS(), z3.VnS(), z0.VnS()); __ Index(z0.VnD(), 0, -1); __ Fdup(z5.VnD(), Float16(1)); __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD()); __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD()); __ Insr(z5.VnD(), 0); __ Frsqrts(z6.VnD(), z5.VnD(), z0.VnD()); __ Frecps(z5.VnD(), z5.VnD(), z0.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t z1_expected[] = {0x4038406040a04100, 0x4180420042004000}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); uint64_t z2_expected[] = {0x3e383e603ea03f00, 0x3f80400040003e00}; ASSERT_EQUAL_SVE(z2_expected, z2.VnD()); uint64_t z3_expected[] = {0x4030000040400000, 0x4040000040000000}; ASSERT_EQUAL_SVE(z3_expected, z3.VnD()); uint64_t z4_expected[] = {0x3ff0000040000000, 0x400000003fc00000}; ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); uint64_t z5_expected[] = {0x4008000000000000, 0x4000000000000000}; ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); uint64_t z6_expected[] = {0x4000000000000000, 0x3ff8000000000000}; ASSERT_EQUAL_SVE(z6_expected, z6.VnD()); } } TEST_SVE(sve_ftsmul) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); __ Index(z0.VnH(), 0, 1); __ Rev(z1.VnH(), z0.VnH()); __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH()); __ Dup(z2.VnH(), 0); __ Fdiv(z2.VnH(), p0.Merging(), z2.VnH(), z2.VnH()); __ Ftsmul(z3.VnH(), z0.VnH(), z1.VnH()); __ Ftsmul(z4.VnH(), z2.VnH(), z1.VnH()); __ Index(z0.VnS(), -7, 1); __ Rev(z1.VnS(), z0.VnS()); __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS()); __ Dup(z2.VnS(), 0); __ Fdiv(z2.VnS(), p0.Merging(), z2.VnS(), z2.VnS()); __ Ftsmul(z5.VnS(), z0.VnS(), z1.VnS()); __ Ftsmul(z6.VnS(), z2.VnS(), z1.VnS()); __ Index(z0.VnD(), 2, -1); __ Rev(z1.VnD(), z0.VnD()); __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD()); __ Dup(z2.VnD(), 0); __ Fdiv(z2.VnD(), p0.Merging(), z2.VnD(), z2.VnD()); __ Ftsmul(z7.VnD(), z0.VnD(), z1.VnD()); __ Ftsmul(z8.VnD(), z2.VnD(), z1.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t z3_expected[] = {0x5220d0804e40cc00, 0x4880c4003c008000}; ASSERT_EQUAL_SVE(z3_expected, z3.VnD()); uint64_t z4_expected[] = {0x7e007e007e007e00, 0x7e007e007e007e00}; ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); uint64_t z5_expected[] = {0xc180000041c80000, 0xc210000042440000}; ASSERT_EQUAL_SVE(z5_expected, z5.VnD()); uint64_t z6_expected[] = {0x7fc000007fc00000, 0x7fc000007fc00000}; ASSERT_EQUAL_SVE(z6_expected, z6.VnD()); uint64_t z7_expected[] = {0x3ff0000000000000, 0xc010000000000000}; ASSERT_EQUAL_SVE(z7_expected, z7.VnD()); uint64_t z8_expected[] = {0x7ff8000000000000, 0x7ff8000000000000}; ASSERT_EQUAL_SVE(z8_expected, z8.VnD()); } } typedef void (MacroAssembler::*FPMulAccFn)( const ZRegister& zd, const PRegisterM& pg, const ZRegister& za, const ZRegister& zn, const ZRegister& zm, FPMacroNaNPropagationOption nan_option); // The `pg_inputs` is used for examining the predication correctness internally. // It does not imply the value of `result` argument. `result` stands for the // expected result on all-true predication. 
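// For each operation the helper below checks four destination-aliasing
// cases (zd aliasing za, zn, zm, and none of them), so that the macro
// assembler's choice between the destructive forms (e.g. fmla, fmad) and a
// movprfx-based expansion is exercised.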
template static void FPMulAccHelper( Test* config, FPMulAccFn macro, unsigned lane_size_in_bits, const int (&pg_inputs)[N], const T (&za_inputs)[N], const T (&zn_inputs)[N], const T (&zm_inputs)[N], const uint64_t (&result)[N], FPMacroNaNPropagationOption nan_option = FastNaNPropagation) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); ZRegister zd = z0.WithLaneSize(lane_size_in_bits); ZRegister za = z1.WithLaneSize(lane_size_in_bits); ZRegister zn = z2.WithLaneSize(lane_size_in_bits); ZRegister zm = z3.WithLaneSize(lane_size_in_bits); uint64_t za_rawbits[N]; uint64_t zn_rawbits[N]; uint64_t zm_rawbits[N]; FPToRawbitsWithSize(za_inputs, za_rawbits, lane_size_in_bits); FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits); FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits); InsrHelper(&masm, za, za_rawbits); InsrHelper(&masm, zn, zn_rawbits); InsrHelper(&masm, zm, zm_rawbits); // Initialize `zd` with a signalling NaN. uint64_t sn = GetSignallingNan(lane_size_in_bits); __ Mov(x29, sn); __ Dup(zd, x29); Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs); // Fmla macro automatically selects between fmla, fmad and movprfx + fmla // Fmls `ditto` fmls, fmsb and movprfx + fmls // Fnmla `ditto` fnmla, fnmad and movprfx + fnmla // Fnmls `ditto` fnmls, fnmsb and movprfx + fnmls // based on what registers are aliased. ZRegister da_result = z10.WithLaneSize(lane_size_in_bits); ZRegister dn_result = z11.WithLaneSize(lane_size_in_bits); ZRegister dm_result = z12.WithLaneSize(lane_size_in_bits); ZRegister d_result = z13.WithLaneSize(lane_size_in_bits); __ Mov(da_result, za); (masm.*macro)(da_result, p0.Merging(), da_result, zn, zm, nan_option); __ Mov(dn_result, zn); (masm.*macro)(dn_result, p0.Merging(), za, dn_result, zm, nan_option); __ Mov(dm_result, zm); (masm.*macro)(dm_result, p0.Merging(), za, zn, dm_result, nan_option); __ Mov(d_result, zd); (masm.*macro)(d_result, p0.Merging(), za, zn, zm, nan_option); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(za_rawbits, za); ASSERT_EQUAL_SVE(zn_rawbits, zn); ASSERT_EQUAL_SVE(zm_rawbits, zm); uint64_t da_expected[N]; uint64_t dn_expected[N]; uint64_t dm_expected[N]; uint64_t d_expected[N]; for (size_t i = 0; i < N; i++) { da_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : za_rawbits[i]; dn_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zn_rawbits[i]; dm_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zm_rawbits[i]; d_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : sn; } ASSERT_EQUAL_SVE(da_expected, da_result); ASSERT_EQUAL_SVE(dn_expected, dn_result); ASSERT_EQUAL_SVE(dm_expected, dm_result); ASSERT_EQUAL_SVE(d_expected, d_result); } } TEST_SVE(sve_fmla_fmad) { // fmla : zd = za + zn * zm double za_inputs[] = {-39.0, 1.0, -3.0, 2.0}; double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0}; double zm_inputs[] = {9.0, -5.0, 4.0, 5.0}; int pg_inputs[] = {1, 1, 0, 1}; uint64_t fmla_result_h[] = {Float16ToRawbits(Float16(-84.0)), Float16ToRawbits(Float16(101.0)), Float16ToRawbits(Float16(33.0)), Float16ToRawbits(Float16(42.0))}; // `fmad` has been tested in the helper. 
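// For example, -39.0 + (-5.0 * 9.0) = -84.0 and 2.0 + (8.0 * 5.0) = 42.0,
// matching the first and last entries of fmla_result_h above.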
FPMulAccHelper(config, &MacroAssembler::Fmla, kHRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fmla_result_h); uint64_t fmla_result_s[] = {FloatToRawbits(-84.0f), FloatToRawbits(101.0f), FloatToRawbits(33.0f), FloatToRawbits(42.0f)}; FPMulAccHelper(config, &MacroAssembler::Fmla, kSRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fmla_result_s); uint64_t fmla_result_d[] = {DoubleToRawbits(-84.0), DoubleToRawbits(101.0), DoubleToRawbits(33.0), DoubleToRawbits(42.0)}; FPMulAccHelper(config, &MacroAssembler::Fmla, kDRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fmla_result_d); } TEST_SVE(sve_fmls_fmsb) { // fmls : zd = za - zn * zm double za_inputs[] = {-39.0, 1.0, -3.0, 2.0}; double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0}; double zm_inputs[] = {9.0, -5.0, 4.0, 5.0}; int pg_inputs[] = {1, 0, 1, 1}; uint64_t fmls_result_h[] = {Float16ToRawbits(Float16(6.0)), Float16ToRawbits(Float16(-99.0)), Float16ToRawbits(Float16(-39.0)), Float16ToRawbits(Float16(-38.0))}; // `fmsb` has been tested in the helper. FPMulAccHelper(config, &MacroAssembler::Fmls, kHRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fmls_result_h); uint64_t fmls_result_s[] = {FloatToRawbits(6.0f), FloatToRawbits(-99.0f), FloatToRawbits(-39.0f), FloatToRawbits(-38.0f)}; FPMulAccHelper(config, &MacroAssembler::Fmls, kSRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fmls_result_s); uint64_t fmls_result_d[] = {DoubleToRawbits(6.0), DoubleToRawbits(-99.0), DoubleToRawbits(-39.0), DoubleToRawbits(-38.0)}; FPMulAccHelper(config, &MacroAssembler::Fmls, kDRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fmls_result_d); } TEST_SVE(sve_fnmla_fnmad) { // fnmla : zd = -za - zn * zm double za_inputs[] = {-39.0, 1.0, -3.0, 2.0}; double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0}; double zm_inputs[] = {9.0, -5.0, 4.0, 5.0}; int pg_inputs[] = {0, 1, 1, 1}; uint64_t fnmla_result_h[] = {Float16ToRawbits(Float16(84.0)), Float16ToRawbits(Float16(-101.0)), Float16ToRawbits(Float16(-33.0)), Float16ToRawbits(Float16(-42.0))}; // `fnmad` has been tested in the helper. FPMulAccHelper(config, &MacroAssembler::Fnmla, kHRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fnmla_result_h); uint64_t fnmla_result_s[] = {FloatToRawbits(84.0f), FloatToRawbits(-101.0f), FloatToRawbits(-33.0f), FloatToRawbits(-42.0f)}; FPMulAccHelper(config, &MacroAssembler::Fnmla, kSRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fnmla_result_s); uint64_t fnmla_result_d[] = {DoubleToRawbits(84.0), DoubleToRawbits(-101.0), DoubleToRawbits(-33.0), DoubleToRawbits(-42.0)}; FPMulAccHelper(config, &MacroAssembler::Fnmla, kDRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fnmla_result_d); } TEST_SVE(sve_fnmls_fnmsb) { // fnmls : zd = -za + zn * zm double za_inputs[] = {-39.0, 1.0, -3.0, 2.0}; double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0}; double zm_inputs[] = {9.0, -5.0, 4.0, 5.0}; int pg_inputs[] = {1, 1, 1, 0}; uint64_t fnmls_result_h[] = {Float16ToRawbits(Float16(-6.0)), Float16ToRawbits(Float16(99.0)), Float16ToRawbits(Float16(39.0)), Float16ToRawbits(Float16(38.0))}; // `fnmsb` has been tested in the helper. 
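// For example, -(-39.0) + (-5.0 * 9.0) = -6.0 and -2.0 + (8.0 * 5.0) = 38.0,
// matching the first and last entries of fnmls_result_h above.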
FPMulAccHelper(config, &MacroAssembler::Fnmls, kHRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fnmls_result_h); uint64_t fnmls_result_s[] = {FloatToRawbits(-6.0f), FloatToRawbits(99.0f), FloatToRawbits(39.0f), FloatToRawbits(38.0f)}; FPMulAccHelper(config, &MacroAssembler::Fnmls, kSRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fnmls_result_s); uint64_t fnmls_result_d[] = {DoubleToRawbits(-6.0), DoubleToRawbits(99.0), DoubleToRawbits(39.0), DoubleToRawbits(38.0)}; FPMulAccHelper(config, &MacroAssembler::Fnmls, kDRegSize, pg_inputs, za_inputs, zn_inputs, zm_inputs, fnmls_result_d); } typedef void (MacroAssembler::*FPMulAccIdxFn)(const ZRegister& zd, const ZRegister& za, const ZRegister& zn, const ZRegister& zm, int index); template static void FPMulAccIdxHelper(Test* config, FPMulAccFn macro, FPMulAccIdxFn macro_idx, const T (&za_inputs)[N], const T (&zn_inputs)[N], const T (&zm_inputs)[N]) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Ptrue(p0.VnB()); // Repeat indexed vector across up to 2048-bit VL. for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i += N) { InsrHelper(&masm, z30.VnD(), zm_inputs); } FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z30.VnH()); InsrHelper(&masm, z1.VnD(), zn_inputs); InsrHelper(&masm, z2.VnD(), za_inputs); __ Mov(z3, z0); (masm.*macro_idx)(z3.VnH(), z2.VnH(), z1.VnH(), z3.VnH(), 0); // zd == zm __ Mov(z4, z1); (masm.*macro_idx)(z4.VnH(), z2.VnH(), z4.VnH(), z0.VnH(), 1); // zd == zn __ Mov(z5, z2); (masm.*macro_idx)(z5.VnH(), z5.VnH(), z1.VnH(), z0.VnH(), 4); // zd == za (masm.*macro_idx)(z6.VnH(), z2.VnH(), z1.VnH(), z0.VnH(), 7); FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z30.VnS()); __ Mov(z7, z0); (masm.*macro_idx)(z7.VnS(), z2.VnS(), z1.VnS(), z7.VnS(), 0); // zd == zm __ Mov(z8, z1); (masm.*macro_idx)(z8.VnS(), z2.VnS(), z8.VnS(), z0.VnS(), 1); // zd == zn __ Mov(z9, z2); (masm.*macro_idx)(z9.VnS(), z9.VnS(), z1.VnS(), z0.VnS(), 2); // zd == za (masm.*macro_idx)(z10.VnS(), z2.VnS(), z1.VnS(), z0.VnS(), 3); FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD()); __ Mov(z11, z0); (masm.*macro_idx)(z11.VnD(), z2.VnD(), z1.VnD(), z11.VnD(), 0); // zd == zm __ Mov(z12, z1); (masm.*macro_idx)(z12.VnD(), z2.VnD(), z12.VnD(), z0.VnD(), 1); // zd == zn __ Mov(z13, z2); (masm.*macro_idx)(z13.VnD(), z13.VnD(), z1.VnD(), z0.VnD(), 0); // zd == za __ Mov(z14, z0); // zd == zn == zm (masm.*macro_idx)(z14.VnD(), z2.VnD(), z14.VnD(), z14.VnD(), 1); // Indexed form of Fmla and Fmls won't swap argument, passing strict NaN // propagation mode to ensure the following macros don't swap argument in // any cases. FPMacroNaNPropagationOption option = StrictNaNPropagation; // Compute the results using other instructions. 
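// Each reference value is computed by duplicating the indexed element of zm
// across the vector and applying the (predicated) vector form of the same
// operation; FPSegmentPatternHelper is used so that the reference matches
// the indexed form, which selects its element independently within each
// 128-bit segment. The checks after RUN() then compare the two sets of
// results lane by lane.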
__ Dup(z0.VnH(), z30.VnH(), 0); FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH()); (masm.*macro)(z15.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option); __ Dup(z0.VnH(), z30.VnH(), 1); FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH()); (masm.*macro)(z16.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option); __ Dup(z0.VnH(), z30.VnH(), 4); FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH()); (masm.*macro)(z17.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option); __ Dup(z0.VnH(), z30.VnH(), 7); FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH()); (masm.*macro)(z18.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option); __ Dup(z0.VnS(), z30.VnS(), 0); FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS()); (masm.*macro)(z19.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option); __ Dup(z0.VnS(), z30.VnS(), 1); FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS()); (masm.*macro)(z20.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option); __ Dup(z0.VnS(), z30.VnS(), 2); FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS()); (masm.*macro)(z21.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option); __ Dup(z0.VnS(), z30.VnS(), 3); FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS()); (masm.*macro)(z22.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option); __ Dup(z0.VnD(), z30.VnD(), 0); FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD()); (masm.*macro)(z23.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option); __ Dup(z0.VnD(), z30.VnD(), 1); FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD()); (masm.*macro)(z24.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option); FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD()); __ Dup(z29.VnD(), z30.VnD(), 1); FPSegmentPatternHelper(&masm, z29.VnD(), p0.Merging(), z29.VnD()); (masm.*macro)(z25.VnD(), p0.Merging(), z2.VnD(), z0.VnD(), z29.VnD(), option); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z15.VnH(), z3.VnH()); ASSERT_EQUAL_SVE(z16.VnH(), z4.VnH()); ASSERT_EQUAL_SVE(z17.VnH(), z5.VnH()); ASSERT_EQUAL_SVE(z18.VnH(), z6.VnH()); ASSERT_EQUAL_SVE(z19.VnS(), z7.VnS()); ASSERT_EQUAL_SVE(z20.VnS(), z8.VnS()); ASSERT_EQUAL_SVE(z21.VnS(), z9.VnS()); ASSERT_EQUAL_SVE(z22.VnS(), z10.VnS()); ASSERT_EQUAL_SVE(z23.VnD(), z11.VnD()); ASSERT_EQUAL_SVE(z24.VnD(), z12.VnD()); ASSERT_EQUAL_SVE(z11.VnD(), z13.VnD()); ASSERT_EQUAL_SVE(z25.VnD(), z14.VnD()); } } TEST_SVE(sve_fmla_fmls_index) { uint64_t zm_inputs_1[] = {0x3ff000003f803c00, 0xbff00000bf80bc00}; uint64_t zn_inputs_1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76}; uint64_t za_inputs_1[] = {0x3c004000bc00c000, 0x64006800e400e800}; // Using the vector form of Fmla and Fmls to verify the indexed form. 
FPMulAccIdxHelper(config, &MacroAssembler::Fmla, // vector form &MacroAssembler::Fmla, // indexed form za_inputs_1, zn_inputs_1, zm_inputs_1); FPMulAccIdxHelper(config, &MacroAssembler::Fmls, // vector form &MacroAssembler::Fmls, // indexed form za_inputs_1, zn_inputs_1, zm_inputs_1); uint64_t zm_inputs_2[] = {0x7ff5555511111111, // NaN 0xfff0000000000000}; // Infinity uint64_t zn_inputs_2[] = {0x7f9511117fc00000, // NaN 0x7f800000ff800000}; // Infinity uint64_t za_inputs_2[] = {0x7c11000000007e00, // NaN 0x000000007c00fc00}; // Infinity FPMulAccIdxHelper(config, &MacroAssembler::Fmla, // vector form &MacroAssembler::Fmla, // indexed form za_inputs_2, zn_inputs_2, zm_inputs_2); FPMulAccIdxHelper(config, &MacroAssembler::Fmls, // vector form &MacroAssembler::Fmls, // indexed form za_inputs_2, zn_inputs_2, zm_inputs_2); } // Execute a number of instructions which all use ProcessNaNs, and check that // they all propagate NaNs correctly. template static void ProcessNaNsHelper(Test* config, int lane_size_in_bits, const Ti (&zn_inputs)[N], const Ti (&zm_inputs)[N], const Td (&zd_expected)[N], FPMacroNaNPropagationOption nan_option) { ArithFn arith_unpredicated_macro[] = {&MacroAssembler::Fadd, &MacroAssembler::Fsub, &MacroAssembler::Fmul}; for (size_t i = 0; i < ArrayLength(arith_unpredicated_macro); i++) { FPBinArithHelper(config, arith_unpredicated_macro[i], lane_size_in_bits, zn_inputs, zm_inputs, zd_expected); } FPArithPredicatedFn arith_predicated_macro[] = {&MacroAssembler::Fmax, &MacroAssembler::Fmin}; int pg_inputs[N]; // With an all-true predicate, this helper aims to compare with special // numbers. for (size_t i = 0; i < N; i++) { pg_inputs[i] = 1; } // fdivr propagates the quotient (Zm) preferentially, so we don't actually // need any special handling for StrictNaNPropagation. FPBinArithHelper(config, NULL, &MacroAssembler::Fdiv, lane_size_in_bits, // With an all-true predicate, the value in zd is // irrelevant to the operations. zn_inputs, pg_inputs, zn_inputs, zm_inputs, zd_expected); for (size_t i = 0; i < ArrayLength(arith_predicated_macro); i++) { FPBinArithHelper(config, arith_predicated_macro[i], NULL, lane_size_in_bits, // With an all-true predicate, the value in zd is // irrelevant to the operations. zn_inputs, pg_inputs, zn_inputs, zm_inputs, zd_expected, nan_option); } } template static void ProcessNaNsHelper3(Test* config, int lane_size_in_bits, const Ti (&za_inputs)[N], const Ti (&zn_inputs)[N], const Ti (&zm_inputs)[N], const Td (&zd_expected_fmla)[N], const Td (&zd_expected_fmls)[N], const Td (&zd_expected_fnmla)[N], const Td (&zd_expected_fnmls)[N], FPMacroNaNPropagationOption nan_option) { int pg_inputs[N]; // With an all-true predicate, this helper aims to compare with special // numbers. for (size_t i = 0; i < N; i++) { pg_inputs[i] = 1; } FPMulAccHelper(config, &MacroAssembler::Fmla, lane_size_in_bits, pg_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_fmla, nan_option); FPMulAccHelper(config, &MacroAssembler::Fmls, lane_size_in_bits, pg_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_fmls, nan_option); FPMulAccHelper(config, &MacroAssembler::Fnmla, lane_size_in_bits, pg_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_fnmla, nan_option); FPMulAccHelper(config, &MacroAssembler::Fnmls, lane_size_in_bits, pg_inputs, za_inputs, zn_inputs, zm_inputs, zd_expected_fnmls, nan_option); } TEST_SVE(sve_process_nans_double) { // Use non-standard NaNs to check that the payload bits are preserved. 
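// For double precision the top fraction bit (bit 51) distinguishes the two
// kinds: it is clear in the signalling NaNs (0x7ff5...) and set in the
// quiet NaNs (0x7ffa...). Processing a signalling NaN quietens it by
// setting that bit, which is why sa_proc, sn_proc and sm_proc below begin
// with 0x7ffd.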
double sa = RawbitsToDouble(0x7ff5555511111111); double sn = RawbitsToDouble(0x7ff5555522222222); double sm = RawbitsToDouble(0x7ff5555533333333); double qa = RawbitsToDouble(0x7ffaaaaa11111111); double qn = RawbitsToDouble(0x7ffaaaaa22222222); double qm = RawbitsToDouble(0x7ffaaaaa33333333); VIXL_ASSERT(IsSignallingNaN(sa)); VIXL_ASSERT(IsSignallingNaN(sn)); VIXL_ASSERT(IsSignallingNaN(sm)); VIXL_ASSERT(IsQuietNaN(qa)); VIXL_ASSERT(IsQuietNaN(qn)); VIXL_ASSERT(IsQuietNaN(qm)); // The input NaNs after passing through ProcessNaN. uint64_t sa_proc = 0x7ffd555511111111; uint64_t sn_proc = 0x7ffd555522222222; uint64_t sm_proc = 0x7ffd555533333333; uint64_t qa_proc = DoubleToRawbits(qa); uint64_t qn_proc = DoubleToRawbits(qn); uint64_t qm_proc = DoubleToRawbits(qm); uint64_t sa_proc_n = sa_proc ^ kDSignMask; uint64_t sn_proc_n = sn_proc ^ kDSignMask; uint64_t qa_proc_n = qa_proc ^ kDSignMask; uint64_t qn_proc_n = qn_proc ^ kDSignMask; // Quiet NaNs are propagated. double zn_inputs_1[] = {qn, 0.0, 0.0, qm, qn, qm}; double zm_inputs_1[] = {0.0, qn, qm, 0.0, qm, qn}; uint64_t zd_expected_1[] = {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc}; ProcessNaNsHelper(config, kDRegSize, zn_inputs_1, zm_inputs_1, zd_expected_1, StrictNaNPropagation); // Signalling NaNs are propagated. double zn_inputs_2[] = {sn, 0.0, 0.0, sm, sn, sm}; double zm_inputs_2[] = {0.0, sn, sm, 0.0, sm, sn}; uint64_t zd_expected_2[] = {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc}; ProcessNaNsHelper(config, kDRegSize, zn_inputs_2, zm_inputs_2, zd_expected_2, StrictNaNPropagation); // Signalling NaNs take precedence over quiet NaNs. double zn_inputs_3[] = {sn, qn, sn, sn, qn}; double zm_inputs_3[] = {qm, sm, sm, qn, sn}; uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc}; ProcessNaNsHelper(config, kDRegSize, zn_inputs_3, zm_inputs_3, zd_expected_3, StrictNaNPropagation); double za_inputs_4[] = {qa, qa, 0.0, 0.0, qa, qa}; double zn_inputs_4[] = {qn, 0.0, 0.0, qn, qn, qn}; double zm_inputs_4[] = {0.0, qm, qm, qm, qm, 0.0}; // If `a` is propagated, its sign is inverted by fnmla and fnmls. // If `n` is propagated, its sign is inverted by fmls and fnmla. // If `m` is propagated, its sign is never inverted. uint64_t zd_expected_fmla_4[] = {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc}; uint64_t zd_expected_fmls_4[] = {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc}; uint64_t zd_expected_fnmla_4[] = {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n}; uint64_t zd_expected_fnmls_4[] = {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n}; ProcessNaNsHelper3(config, kDRegSize, za_inputs_4, zn_inputs_4, zm_inputs_4, zd_expected_fmla_4, zd_expected_fmls_4, zd_expected_fnmla_4, zd_expected_fnmls_4, StrictNaNPropagation); // Signalling NaNs take precedence over quiet NaNs. 
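  // For the three-operand forms, a signalling NaN in any operand wins over a
  // quiet NaN in any other; among NaNs of the same kind, the accumulator (za)
  // takes precedence over zn, which takes precedence over zm.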
double za_inputs_5[] = {qa, qa, sa, sa, sa}; double zn_inputs_5[] = {qn, sn, sn, sn, qn}; double zm_inputs_5[] = {sm, qm, sm, qa, sm}; uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc}; uint64_t zd_expected_fmls_5[] = {sm_proc, sn_proc_n, sa_proc, sa_proc, sa_proc}; uint64_t zd_expected_fnmla_5[] = {sm_proc, sn_proc_n, sa_proc_n, sa_proc_n, sa_proc_n}; uint64_t zd_expected_fnmls_5[] = {sm_proc, sn_proc, sa_proc_n, sa_proc_n, sa_proc_n}; ProcessNaNsHelper3(config, kDRegSize, za_inputs_5, zn_inputs_5, zm_inputs_5, zd_expected_fmla_5, zd_expected_fmls_5, zd_expected_fnmla_5, zd_expected_fnmls_5, StrictNaNPropagation); const double inf = kFP64PositiveInfinity; const double inf_n = kFP64NegativeInfinity; uint64_t inf_proc = DoubleToRawbits(inf); uint64_t inf_proc_n = DoubleToRawbits(inf_n); uint64_t d_inf_proc = DoubleToRawbits(kFP64DefaultNaN); double za_inputs_6[] = {qa, qa, 0.0f, -0.0f, qa, sa}; double zn_inputs_6[] = {inf, -0.0f, -0.0f, inf, inf_n, inf}; double zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f}; // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the // quiet_nan. uint64_t zd_expected_fmla_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc}; uint64_t zd_expected_fmls_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc}; uint64_t zd_expected_fnmla_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n}; uint64_t zd_expected_fnmls_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n}; ProcessNaNsHelper3(config, kDRegSize, za_inputs_6, zn_inputs_6, zm_inputs_6, zd_expected_fmla_6, zd_expected_fmls_6, zd_expected_fnmla_6, zd_expected_fnmls_6, StrictNaNPropagation); } TEST_SVE(sve_process_nans_float) { // Use non-standard NaNs to check that the payload bits are preserved. float sa = RawbitsToFloat(0x7f951111); float sn = RawbitsToFloat(0x7f952222); float sm = RawbitsToFloat(0x7f953333); float qa = RawbitsToFloat(0x7fea1111); float qn = RawbitsToFloat(0x7fea2222); float qm = RawbitsToFloat(0x7fea3333); VIXL_ASSERT(IsSignallingNaN(sa)); VIXL_ASSERT(IsSignallingNaN(sn)); VIXL_ASSERT(IsSignallingNaN(sm)); VIXL_ASSERT(IsQuietNaN(qa)); VIXL_ASSERT(IsQuietNaN(qn)); VIXL_ASSERT(IsQuietNaN(qm)); // The input NaNs after passing through ProcessNaN. uint32_t sa_proc = 0x7fd51111; uint32_t sn_proc = 0x7fd52222; uint32_t sm_proc = 0x7fd53333; uint32_t qa_proc = FloatToRawbits(qa); uint32_t qn_proc = FloatToRawbits(qn); uint32_t qm_proc = FloatToRawbits(qm); uint32_t sa_proc_n = sa_proc ^ kSSignMask; uint32_t sn_proc_n = sn_proc ^ kSSignMask; uint32_t qa_proc_n = qa_proc ^ kSSignMask; uint32_t qn_proc_n = qn_proc ^ kSSignMask; // Quiet NaNs are propagated. float zn_inputs_1[] = {qn, 0.0f, 0.0f, qm, qn, qm}; float zm_inputs_1[] = {0.0f, qn, qm, 0.0f, qm, qn}; uint64_t zd_expected_1[] = {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc}; ProcessNaNsHelper(config, kSRegSize, zn_inputs_1, zm_inputs_1, zd_expected_1, StrictNaNPropagation); // Signalling NaNs are propagated. float zn_inputs_2[] = {sn, 0.0f, 0.0f, sm, sn, sm}; float zm_inputs_2[] = {0.0f, sn, sm, 0.0f, sm, sn}; uint64_t zd_expected_2[] = {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc}; ProcessNaNsHelper(config, kSRegSize, zn_inputs_2, zm_inputs_2, zd_expected_2, StrictNaNPropagation); // Signalling NaNs take precedence over quiet NaNs. 
float zn_inputs_3[] = {sn, qn, sn, sn, qn}; float zm_inputs_3[] = {qm, sm, sm, qn, sn}; uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc}; ProcessNaNsHelper(config, kSRegSize, zn_inputs_3, zm_inputs_3, zd_expected_3, StrictNaNPropagation); float za_inputs_4[] = {qa, qa, 0.0f, 0.0f, qa, qa}; float zn_inputs_4[] = {qn, 0.0f, 0.0f, qn, qn, qn}; float zm_inputs_4[] = {0.0f, qm, qm, qm, qm, 0.0f}; // If `a` is propagated, its sign is inverted by fnmla and fnmls. // If `n` is propagated, its sign is inverted by fmls and fnmla. // If `m` is propagated, its sign is never inverted. uint64_t zd_expected_fmla_4[] = {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc}; uint64_t zd_expected_fmls_4[] = {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc}; uint64_t zd_expected_fnmla_4[] = {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n}; uint64_t zd_expected_fnmls_4[] = {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n}; ProcessNaNsHelper3(config, kSRegSize, za_inputs_4, zn_inputs_4, zm_inputs_4, zd_expected_fmla_4, zd_expected_fmls_4, zd_expected_fnmla_4, zd_expected_fnmls_4, StrictNaNPropagation); // Signalling NaNs take precedence over quiet NaNs. float za_inputs_5[] = {qa, qa, sa, sa, sa}; float zn_inputs_5[] = {qn, sn, sn, sn, qn}; float zm_inputs_5[] = {sm, qm, sm, qa, sm}; uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc}; uint64_t zd_expected_fmls_5[] = {sm_proc, sn_proc_n, sa_proc, sa_proc, sa_proc}; uint64_t zd_expected_fnmla_5[] = {sm_proc, sn_proc_n, sa_proc_n, sa_proc_n, sa_proc_n}; uint64_t zd_expected_fnmls_5[] = {sm_proc, sn_proc, sa_proc_n, sa_proc_n, sa_proc_n}; ProcessNaNsHelper3(config, kSRegSize, za_inputs_5, zn_inputs_5, zm_inputs_5, zd_expected_fmla_5, zd_expected_fmls_5, zd_expected_fnmla_5, zd_expected_fnmls_5, StrictNaNPropagation); const float inf = kFP32PositiveInfinity; const float inf_n = kFP32NegativeInfinity; uint32_t inf_proc = FloatToRawbits(inf); uint32_t inf_proc_n = FloatToRawbits(inf_n); uint32_t d_inf_proc = FloatToRawbits(kFP32DefaultNaN); float za_inputs_6[] = {qa, qa, 0.0f, 0.0f, qa, sa}; float zn_inputs_6[] = {inf, 0.0f, 0.0f, inf, inf_n, inf}; float zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f}; // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the // quiet_nan. uint64_t zd_expected_fmla_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc}; uint64_t zd_expected_fmls_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc}; uint64_t zd_expected_fnmla_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n}; uint64_t zd_expected_fnmls_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n}; ProcessNaNsHelper3(config, kSRegSize, za_inputs_6, zn_inputs_6, zm_inputs_6, zd_expected_fmla_6, zd_expected_fmls_6, zd_expected_fnmla_6, zd_expected_fnmls_6, StrictNaNPropagation); } TEST_SVE(sve_process_nans_half) { // Use non-standard NaNs to check that the payload bits are preserved. 
Float16 sa(RawbitsToFloat16(0x7c11)); Float16 sn(RawbitsToFloat16(0x7c22)); Float16 sm(RawbitsToFloat16(0x7c33)); Float16 qa(RawbitsToFloat16(0x7e44)); Float16 qn(RawbitsToFloat16(0x7e55)); Float16 qm(RawbitsToFloat16(0x7e66)); VIXL_ASSERT(IsSignallingNaN(sa)); VIXL_ASSERT(IsSignallingNaN(sn)); VIXL_ASSERT(IsSignallingNaN(sm)); VIXL_ASSERT(IsQuietNaN(qa)); VIXL_ASSERT(IsQuietNaN(qn)); VIXL_ASSERT(IsQuietNaN(qm)); // The input NaNs after passing through ProcessNaN. uint16_t sa_proc = 0x7e11; uint16_t sn_proc = 0x7e22; uint16_t sm_proc = 0x7e33; uint16_t qa_proc = Float16ToRawbits(qa); uint16_t qn_proc = Float16ToRawbits(qn); uint16_t qm_proc = Float16ToRawbits(qm); uint16_t sa_proc_n = sa_proc ^ kHSignMask; uint16_t sn_proc_n = sn_proc ^ kHSignMask; uint16_t qa_proc_n = qa_proc ^ kHSignMask; uint16_t qn_proc_n = qn_proc ^ kHSignMask; Float16 zero(0.0); // Quiet NaNs are propagated. Float16 zn_inputs_1[] = {qn, zero, zero, qm, qn, qm}; Float16 zm_inputs_1[] = {zero, qn, qm, zero, qm, qn}; uint64_t zd_expected_1[] = {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc}; ProcessNaNsHelper(config, kHRegSize, zn_inputs_1, zm_inputs_1, zd_expected_1, StrictNaNPropagation); // Signalling NaNs are propagated. Float16 zn_inputs_2[] = {sn, zero, zero, sm, sn, sm}; Float16 zm_inputs_2[] = {zero, sn, sm, zero, sm, sn}; uint64_t zd_expected_2[] = {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc}; ProcessNaNsHelper(config, kHRegSize, zn_inputs_2, zm_inputs_2, zd_expected_2, StrictNaNPropagation); // Signalling NaNs take precedence over quiet NaNs. Float16 zn_inputs_3[] = {sn, qn, sn, sn, qn}; Float16 zm_inputs_3[] = {qm, sm, sm, qn, sn}; uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc}; ProcessNaNsHelper(config, kHRegSize, zn_inputs_3, zm_inputs_3, zd_expected_3, StrictNaNPropagation); Float16 za_inputs_4[] = {qa, qa, zero, zero, qa, qa}; Float16 zn_inputs_4[] = {qn, zero, zero, qn, qn, qn}; Float16 zm_inputs_4[] = {zero, qm, qm, qm, qm, zero}; // If `a` is propagated, its sign is inverted by fnmla and fnmls. // If `n` is propagated, its sign is inverted by fmls and fnmla. // If `m` is propagated, its sign is never inverted. uint64_t zd_expected_fmla_4[] = {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc}; uint64_t zd_expected_fmls_4[] = {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc}; uint64_t zd_expected_fnmla_4[] = {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n}; uint64_t zd_expected_fnmls_4[] = {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n}; ProcessNaNsHelper3(config, kHRegSize, za_inputs_4, zn_inputs_4, zm_inputs_4, zd_expected_fmla_4, zd_expected_fmls_4, zd_expected_fnmla_4, zd_expected_fnmls_4, StrictNaNPropagation); // Signalling NaNs take precedence over quiet NaNs. 
Float16 za_inputs_5[] = {qa, qa, sa, sa, sa}; Float16 zn_inputs_5[] = {qn, sn, sn, sn, qn}; Float16 zm_inputs_5[] = {sm, qm, sm, qa, sm}; uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc}; uint64_t zd_expected_fmls_5[] = {sm_proc, sn_proc_n, sa_proc, sa_proc, sa_proc}; uint64_t zd_expected_fnmla_5[] = {sm_proc, sn_proc_n, sa_proc_n, sa_proc_n, sa_proc_n}; uint64_t zd_expected_fnmls_5[] = {sm_proc, sn_proc, sa_proc_n, sa_proc_n, sa_proc_n}; ProcessNaNsHelper3(config, kHRegSize, za_inputs_5, zn_inputs_5, zm_inputs_5, zd_expected_fmla_5, zd_expected_fmls_5, zd_expected_fnmla_5, zd_expected_fnmls_5, StrictNaNPropagation); const Float16 inf = kFP16PositiveInfinity; const Float16 inf_n = kFP16NegativeInfinity; uint64_t inf_proc = Float16ToRawbits(inf); uint64_t inf_proc_n = Float16ToRawbits(inf_n); uint64_t d_inf_proc = Float16ToRawbits(kFP16DefaultNaN); Float16 za_inputs_6[] = {qa, qa, zero, zero, qa, sa}; Float16 zn_inputs_6[] = {inf, zero, zero, inf, inf_n, inf}; Float16 zm_inputs_6[] = {zero, inf_n, inf, inf, inf, zero}; // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the // quiet_nan. uint64_t zd_expected_fmla_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc}; uint64_t zd_expected_fmls_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc}; uint64_t zd_expected_fnmla_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n}; uint64_t zd_expected_fnmls_6[] = {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n}; ProcessNaNsHelper3(config, kHRegSize, za_inputs_6, zn_inputs_6, zm_inputs_6, zd_expected_fmla_6, zd_expected_fmls_6, zd_expected_fnmla_6, zd_expected_fnmls_6, StrictNaNPropagation); } typedef void (MacroAssembler::*FCmpFn)(const PRegisterWithLaneSize& pd, const PRegisterZ& pg, const ZRegister& zn, const ZRegister& zm); typedef void (MacroAssembler::*FCmpZeroFn)(const PRegisterWithLaneSize& pd, const PRegisterZ& pg, const ZRegister& zn, double zero); typedef void (MacroAssembler::*CmpFn)(const PRegisterWithLaneSize& pd, const PRegisterZ& pg, const ZRegister& zn, const ZRegister& zm); static FCmpFn GetFpAbsCompareFn(Condition cond) { switch (cond) { case ge: return &MacroAssembler::Facge; case gt: return &MacroAssembler::Facgt; case le: return &MacroAssembler::Facle; case lt: return &MacroAssembler::Faclt; default: VIXL_UNIMPLEMENTED(); return NULL; } } static FCmpFn GetFpCompareFn(Condition cond) { switch (cond) { case ge: return &MacroAssembler::Fcmge; case gt: return &MacroAssembler::Fcmgt; case le: return &MacroAssembler::Fcmle; case lt: return &MacroAssembler::Fcmlt; case eq: return &MacroAssembler::Fcmeq; case ne: return &MacroAssembler::Fcmne; case uo: return &MacroAssembler::Fcmuo; default: VIXL_UNIMPLEMENTED(); return NULL; } } static FCmpZeroFn GetFpCompareZeroFn(Condition cond) { switch (cond) { case ge: return &MacroAssembler::Fcmge; case gt: return &MacroAssembler::Fcmgt; case le: return &MacroAssembler::Fcmle; case lt: return &MacroAssembler::Fcmlt; case eq: return &MacroAssembler::Fcmeq; case ne: return &MacroAssembler::Fcmne; default: VIXL_UNIMPLEMENTED(); return NULL; } } static CmpFn GetIntCompareFn(Condition cond) { switch (cond) { case ge: return &MacroAssembler::Cmpge; case gt: return &MacroAssembler::Cmpgt; case le: return &MacroAssembler::Cmple; case lt: return &MacroAssembler::Cmplt; case eq: return &MacroAssembler::Cmpeq; case ne: return &MacroAssembler::Cmpne; 
default: VIXL_UNIMPLEMENTED(); return NULL; } } template static void TestFpCompareHelper(Test* config, int lane_size_in_bits, Condition cond, const double (&zn_inputs)[N], const double (&zm_inputs)[N], const int (&pd_expected)[N], bool is_absolute = false) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); ZRegister zt_int_1 = z1.WithLaneSize(lane_size_in_bits); ZRegister zt_int_2 = z2.WithLaneSize(lane_size_in_bits); ZRegister zt_int_3 = z3.WithLaneSize(lane_size_in_bits); ZRegister zt_fp_1 = z11.WithLaneSize(lane_size_in_bits); ZRegister zt_fp_2 = z12.WithLaneSize(lane_size_in_bits); ZRegister zt_fp_3 = z13.WithLaneSize(lane_size_in_bits); ZRegister fp_one = z31.WithLaneSize(lane_size_in_bits); PRegisterWithLaneSize pd_result_int_1 = p15.WithLaneSize(lane_size_in_bits); PRegisterWithLaneSize pd_result_fp_1 = p14.WithLaneSize(lane_size_in_bits); PRegisterWithLaneSize pd_result_int_2 = p13.WithLaneSize(lane_size_in_bits); PRegisterWithLaneSize pd_result_fp_2 = p12.WithLaneSize(lane_size_in_bits); FCmpFn fcmp = is_absolute ? GetFpAbsCompareFn(cond) : GetFpCompareFn(cond); __ Ptrue(p1.VnB()); if (cond != uo) { int pg_inputs[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1}; Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs); __ Fdup(fp_one, 0.1f); __ Index(zt_int_1, 3, 3); __ Scvtf(zt_fp_1, p0.Merging(), zt_int_1); __ Fadd(zt_fp_1, zt_fp_1, fp_one); __ Index(zt_int_2, 3, -10); __ Scvtf(zt_fp_2, p0.Merging(), zt_int_2); __ Fadd(zt_fp_2, zt_fp_2, fp_one); __ Index(zt_int_3, 3, 2); __ Scvtf(zt_fp_3, p0.Merging(), zt_int_3); __ Fadd(zt_fp_3, zt_fp_3, fp_one); // There is no absolute comparison in integer type, use `abs` with `cmp` // to synthesize the expected result for `fac`. if (is_absolute == true) { __ Abs(zt_int_2, p1.Merging(), zt_int_2); } CmpFn cmp = GetIntCompareFn(cond); (masm.*cmp)(pd_result_int_1, p0.Zeroing(), zt_int_1, zt_int_2); (masm.*fcmp)(pd_result_fp_1, p0.Zeroing(), zt_fp_1, zt_fp_2); (masm.*cmp)(pd_result_int_2, p0.Zeroing(), zt_int_1, zt_int_3); (masm.*fcmp)(pd_result_fp_2, p0.Zeroing(), zt_fp_1, zt_fp_3); } uint64_t zn_inputs_rawbits[N]; uint64_t zm_inputs_rawbits[N]; FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits); FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits); ZRegister zn_fp = z14.WithLaneSize(lane_size_in_bits); ZRegister zm_fp = z15.WithLaneSize(lane_size_in_bits); InsrHelper(&masm, zn_fp, zn_inputs_rawbits); InsrHelper(&masm, zm_fp, zm_inputs_rawbits); PRegisterWithLaneSize pd_result_fp_3 = p11.WithLaneSize(lane_size_in_bits); (masm.*fcmp)(pd_result_fp_3, p1.Zeroing(), zn_fp, zm_fp); END(); if (CAN_RUN()) { RUN(); if (cond != uo) { ASSERT_EQUAL_SVE(pd_result_int_1, pd_result_fp_1); ASSERT_EQUAL_SVE(pd_result_int_2, pd_result_fp_2); } ASSERT_EQUAL_SVE(pd_expected, pd_result_fp_3); } } TEST_SVE(sve_fp_compare_vectors) { double inf_p = kFP64PositiveInfinity; double inf_n = kFP64NegativeInfinity; double nan = kFP64DefaultNaN; // Normal floating point comparison has been tested in the helper. 
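  // The Fac<cond> forms compare absolute values, so a lane such as 1.0 vs
  // -inf is true for Fcmgt but false for Facgt. Comparisons involving a NaN
  // are unordered: for such lanes only ne and uo are true for the Fcm<cond>
  // forms, and the Fac<cond> forms are always false.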
double zn[] = {0.0, inf_n, 1.0, inf_p, inf_p, nan, 0.0, nan}; double zm[] = {-0.0, inf_n, inf_n, -2.0, inf_n, nan, nan, inf_p}; int pd_fcm_gt[] = {0, 0, 1, 1, 1, 0, 0, 0}; int pd_fcm_lt[] = {0, 0, 0, 0, 0, 0, 0, 0}; int pd_fcm_ge[] = {1, 1, 1, 1, 1, 0, 0, 0}; int pd_fcm_le[] = {1, 1, 0, 0, 0, 0, 0, 0}; int pd_fcm_eq[] = {1, 1, 0, 0, 0, 0, 0, 0}; int pd_fcm_ne[] = {0, 0, 1, 1, 1, 1, 1, 1}; int pd_fcm_uo[] = {0, 0, 0, 0, 0, 1, 1, 1}; int pd_fac_gt[] = {0, 0, 0, 1, 0, 0, 0, 0}; int pd_fac_lt[] = {0, 0, 1, 0, 0, 0, 0, 0}; int pd_fac_ge[] = {1, 1, 0, 1, 1, 0, 0, 0}; int pd_fac_le[] = {1, 1, 1, 0, 1, 0, 0, 0}; int lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize}; for (size_t i = 0; i < ArrayLength(lane_sizes); i++) { int lane_size = lane_sizes[i]; // Test floating-point compare vectors. TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fcm_gt); TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fcm_lt); TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fcm_ge); TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fcm_le); TestFpCompareHelper(config, lane_size, eq, zn, zm, pd_fcm_eq); TestFpCompareHelper(config, lane_size, ne, zn, zm, pd_fcm_ne); TestFpCompareHelper(config, lane_size, uo, zn, zm, pd_fcm_uo); // Test floating-point absolute compare vectors. TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fac_gt, true); TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fac_lt, true); TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fac_ge, true); TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fac_le, true); } } template static void TestFpCompareZeroHelper(Test* config, int lane_size_in_bits, Condition cond, const T (&zn_inputs)[N], const int (&pd_expected)[N]) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); ZRegister zn = z28.WithLaneSize(lane_size_in_bits); PRegisterWithLaneSize pd = p14.WithLaneSize(lane_size_in_bits); uint64_t zn_rawbits[N]; FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits); InsrHelper(&masm, zn, zn_rawbits); __ Ptrue(p0.VnB()); (masm.*GetFpCompareZeroFn(cond))(pd, p0.Zeroing(), zn, 0.0); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(pd_expected, pd); } } TEST_SVE(sve_fp_compare_vector_zero) { Float16 fp16_inf_p = kFP16PositiveInfinity; Float16 fp16_inf_n = kFP16NegativeInfinity; Float16 fp16_dn = kFP16DefaultNaN; Float16 fp16_sn = RawbitsToFloat16(0x7c22); Float16 fp16_qn = RawbitsToFloat16(0x7e55); float fp32_inf_p = kFP32PositiveInfinity; float fp32_inf_n = kFP32NegativeInfinity; float fp32_dn = kFP32DefaultNaN; float fp32_sn = RawbitsToFloat(0x7f952222); float fp32_qn = RawbitsToFloat(0x7fea2222); double fp64_inf_p = kFP64PositiveInfinity; double fp64_inf_n = kFP64NegativeInfinity; double fp64_dn = kFP64DefaultNaN; double fp64_sn = RawbitsToDouble(0x7ff5555511111111); double fp64_qn = RawbitsToDouble(0x7ffaaaaa11111111); // Normal floating point comparison has been tested in the non-zero form. 
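  // The compare-with-zero forms compare against +0.0; -0.0 compares equal to
  // it, and NaN inputs (default, signalling or quiet) are unordered, so only
  // the ne condition is true for those lanes.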
Float16 zn_inputs_h[] = {Float16(0.0), Float16(-0.0), fp16_inf_p, fp16_inf_n, fp16_dn, fp16_sn, fp16_qn}; float zn_inputs_s[] = {0.0, -0.0, fp32_inf_p, fp32_inf_n, fp32_dn, fp32_sn, fp32_qn}; double zn_inputs_d[] = {0.0, -0.0, fp64_inf_p, fp64_inf_n, fp64_dn, fp64_sn, fp64_qn}; int pd_expected_gt[] = {0, 0, 1, 0, 0, 0, 0}; int pd_expected_lt[] = {0, 0, 0, 1, 0, 0, 0}; int pd_expected_ge[] = {1, 1, 1, 0, 0, 0, 0}; int pd_expected_le[] = {1, 1, 0, 1, 0, 0, 0}; int pd_expected_eq[] = {1, 1, 0, 0, 0, 0, 0}; int pd_expected_ne[] = {0, 0, 1, 1, 1, 1, 1}; TestFpCompareZeroHelper(config, kDRegSize, gt, zn_inputs_d, pd_expected_gt); TestFpCompareZeroHelper(config, kDRegSize, lt, zn_inputs_d, pd_expected_lt); TestFpCompareZeroHelper(config, kDRegSize, ge, zn_inputs_d, pd_expected_ge); TestFpCompareZeroHelper(config, kDRegSize, le, zn_inputs_d, pd_expected_le); TestFpCompareZeroHelper(config, kDRegSize, eq, zn_inputs_d, pd_expected_eq); TestFpCompareZeroHelper(config, kDRegSize, ne, zn_inputs_d, pd_expected_ne); TestFpCompareZeroHelper(config, kSRegSize, gt, zn_inputs_s, pd_expected_gt); TestFpCompareZeroHelper(config, kSRegSize, lt, zn_inputs_s, pd_expected_lt); TestFpCompareZeroHelper(config, kSRegSize, ge, zn_inputs_s, pd_expected_ge); TestFpCompareZeroHelper(config, kSRegSize, le, zn_inputs_s, pd_expected_le); TestFpCompareZeroHelper(config, kSRegSize, eq, zn_inputs_s, pd_expected_eq); TestFpCompareZeroHelper(config, kSRegSize, ne, zn_inputs_s, pd_expected_ne); TestFpCompareZeroHelper(config, kHRegSize, gt, zn_inputs_h, pd_expected_gt); TestFpCompareZeroHelper(config, kHRegSize, lt, zn_inputs_h, pd_expected_lt); TestFpCompareZeroHelper(config, kHRegSize, ge, zn_inputs_h, pd_expected_ge); TestFpCompareZeroHelper(config, kHRegSize, le, zn_inputs_h, pd_expected_le); TestFpCompareZeroHelper(config, kHRegSize, eq, zn_inputs_h, pd_expected_eq); TestFpCompareZeroHelper(config, kHRegSize, ne, zn_inputs_h, pd_expected_ne); } typedef void (MacroAssembler::*FPUnaryMFn)(const ZRegister& zd, const PRegisterM& pg, const ZRegister& zn); typedef void (MacroAssembler::*FPUnaryZFn)(const ZRegister& zd, const PRegisterZ& pg, const ZRegister& zn); template static void TestFPUnaryPredicatedHelper(Test* config, int src_size_in_bits, int dst_size_in_bits, uint64_t (&zn_inputs)[N], const uint64_t (&pg_inputs)[M], const uint64_t (&zd_expected)[N], FPUnaryMFn macro_m, FPUnaryZFn macro_z) { // Provide the full predicate input. VIXL_ASSERT(M == (kPRegMaxSize / kDRegSize)); SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); int ds = dst_size_in_bits; int ss = src_size_in_bits; int ls = std::max(ss, ds); // When destination type is larger than source type, fill the high parts with // noise values, which should be ignored. if (ds > ss) { VIXL_ASSERT(ss < 64); uint64_t zn_inputs_mod[N]; uint64_t sn = GetSignallingNan(ss); for (unsigned i = 0; i < N; i++) { zn_inputs_mod[i] = zn_inputs[i] | ((sn + i) << ss); } InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs_mod); } else { InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs); } // Make a copy so we can check that constructive operations preserve zn. __ Mov(z28, z29); // Run the operation on all lanes. __ Ptrue(p0.WithLaneSize(ls)); (masm.*macro_m)(z27.WithLaneSize(ds), p0.Merging(), z28.WithLaneSize(ss)); Initialise(&masm, p1.VnB(), pg_inputs[3], pg_inputs[2], pg_inputs[1], pg_inputs[0]); // Clear the irrelevant lanes. __ Index(z31.WithLaneSize(ls), 0, 1); __ Cmplt(p1.WithLaneSize(ls), p1.Zeroing(), z31.WithLaneSize(ls), N); // Check merging predication. 
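  // With merging predication the inactive destination lanes keep their
  // previous value (an Index pattern starting at 42 below); with zeroing
  // predication they are cleared. The reference vectors are built the same
  // way, using an explicit predicated Mov over either the base pattern or a
  // zeroed vector.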
  __ Index(z11.WithLaneSize(ls), 42, 1);
  // Preserve the base value so we can derive the expected result.
  __ Mov(z21, z11);
  __ Mov(z9, z11);
  (masm.*macro_m)(z11.WithLaneSize(ds), p1.Merging(), z28.WithLaneSize(ss));

  // Generate expected values using explicit merging operations.
  InsrHelper(&masm, z25.WithLaneSize(ls), zd_expected);
  __ Mov(z21.WithLaneSize(ls), p1.Merging(), z25.WithLaneSize(ls));

  // Check zeroing predication.
  __ Index(z12.WithLaneSize(ds), 42, -1);
  (masm.*macro_z)(z12.WithLaneSize(ds), p1.Zeroing(), z28.WithLaneSize(ss));

  // Generate expected values using explicit zeroing operations.
  InsrHelper(&masm, z30.WithLaneSize(ls), zd_expected);
  // Emulate zeroing predication.
  __ Dup(z22.WithLaneSize(ls), 0);
  __ Mov(z22.WithLaneSize(ls), p1.Merging(), z30.WithLaneSize(ls));

  // Check an in-place update.
  __ Mov(z9.WithLaneSize(ls), p1.Merging(), z28.WithLaneSize(ls));
  (masm.*macro_m)(z9.WithLaneSize(ds), p1.Merging(), z9.WithLaneSize(ss));

  END();

  if (CAN_RUN()) {
    RUN();

    // Check all lanes.
    ASSERT_EQUAL_SVE(zd_expected, z27.WithLaneSize(ls));

    // Check that constructive operations preserve their inputs.
    ASSERT_EQUAL_SVE(z28, z29);

    // Check merging predication.
    ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z11.WithLaneSize(ls));

    // Check zeroing predication.
    ASSERT_EQUAL_SVE(z22.WithLaneSize(ls), z12.WithLaneSize(ls));

    // Check in-place operation where zd == zn.
    ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z9.WithLaneSize(ls));
  }
}

template <typename T, size_t N>
static void TestFPUnaryPredicatedHelper(Test* config,
                                        int src_size_in_bits,
                                        int dst_size_in_bits,
                                        T (&zn_inputs)[N],
                                        const T (&zd_expected)[N],
                                        FPUnaryMFn macro_m,
                                        FPUnaryZFn macro_z) {
  uint64_t pg_inputs[] = {0xa55aa55aa55aa55a, 0xa55aa55aa55aa55a,
                          0xa55aa55aa55aa55a, 0xa55aa55aa55aa55a};

  TestFPUnaryPredicatedHelper(config,
                              src_size_in_bits,
                              dst_size_in_bits,
                              zn_inputs,
                              pg_inputs,
                              zd_expected,
                              macro_m,
                              macro_z);

  // The complement of the above predicate, to get full input coverage.
  uint64_t pg_c_inputs[] = {0x5aa55aa55aa55aa5, 0x5aa55aa55aa55aa5,
                            0x5aa55aa55aa55aa5, 0x5aa55aa55aa55aa5};

  TestFPUnaryPredicatedHelper(config,
                              src_size_in_bits,
                              dst_size_in_bits,
                              zn_inputs,
                              pg_c_inputs,
                              zd_expected,
                              macro_m,
                              macro_z);
}

template <typename T, size_t N>
static void TestFcvtHelper(Test* config,
                           int src_size_in_bits,
                           int dst_size_in_bits,
                           T (&zn_inputs)[N],
                           const T (&zd_expected)[N]) {
  TestFPUnaryPredicatedHelper(config,
                              src_size_in_bits,
                              dst_size_in_bits,
                              zn_inputs,
                              zd_expected,
                              &MacroAssembler::Fcvt,   // Merging form.
                              &MacroAssembler::Fcvt);  // Zeroing form.
}

TEST_SVE(sve_fcvt) {
  uint64_t h_vals[] = {0x7c00, 0xfc00, 0, 0x8000,
                       0x7bff,   // Max half precision.
                       0x0400,   // Min positive normal.
                       0x03ff,   // Max subnormal.
                       0x0001};  // Min positive subnormal.

  uint64_t s_vals[] = {0x7f800000, 0xff800000, 0, 0x80000000,
                       0x477fe000, 0x38800000, 0x387fc000, 0x33800000};

  uint64_t d_vals[] = {0x7ff0000000000000, 0xfff0000000000000, 0,
                       0x8000000000000000, 0x40effc0000000000,
                       0x3f10000000000000, 0x3f0ff80000000000,
                       0x3e70000000000000};

  TestFcvtHelper(config, kHRegSize, kSRegSize, h_vals, s_vals);
  TestFcvtHelper(config, kSRegSize, kHRegSize, s_vals, h_vals);
  TestFcvtHelper(config, kSRegSize, kDRegSize, s_vals, d_vals);
  TestFcvtHelper(config, kDRegSize, kSRegSize, d_vals, s_vals);
  TestFcvtHelper(config, kHRegSize, kDRegSize, h_vals, d_vals);
  TestFcvtHelper(config, kDRegSize, kHRegSize, d_vals, h_vals);
}

TEST_SVE(sve_fcvt_nan) {
  uint64_t h_inputs[] = {0x7e55,   // Quiet NaN.
                         0x7c22};  // Signalling NaN.
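  // When Fcvt converts a NaN it quietens it and keeps as much of the payload
  // as fits, left-aligned in the destination fraction. For example, the quiet
  // half-precision NaN 0x7e55 widens to the single-precision 0x7fcaa000: its
  // ten fraction bits (0x255) move into the top of the 23-bit fraction.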
uint64_t h2s_expected[] = {0x7fcaa000, 0x7fc44000}; uint64_t h2d_expected[] = {0x7ff9540000000000, 0x7ff8880000000000}; uint64_t s_inputs[] = {0x7fc12345, // Quiet NaN. 0x7f812345}; // Signalling NaN. uint64_t s2h_expected[] = {0x7e09, 0x7e09}; uint64_t s2d_expected[] = {0x7ff82468a0000000, 0x7ff82468a0000000}; uint64_t d_inputs[] = {0x7ffaaaaa22222222, // Quiet NaN. 0x7ff5555511111111}; // Signalling NaN. uint64_t d2h_expected[] = {0x7eaa, 0x7f55}; uint64_t d2s_expected[] = {0x7fd55551, 0x7feaaaa8}; TestFcvtHelper(config, kHRegSize, kSRegSize, h_inputs, h2s_expected); TestFcvtHelper(config, kSRegSize, kHRegSize, s_inputs, s2h_expected); TestFcvtHelper(config, kHRegSize, kDRegSize, h_inputs, h2d_expected); TestFcvtHelper(config, kDRegSize, kHRegSize, d_inputs, d2h_expected); TestFcvtHelper(config, kSRegSize, kDRegSize, s_inputs, s2d_expected); TestFcvtHelper(config, kDRegSize, kSRegSize, d_inputs, d2s_expected); } template static void TestFrecpxHelper(Test* config, int lane_size_in_bits, T (&zn_inputs)[N], const T (&zd_expected)[N]) { TestFPUnaryPredicatedHelper(config, lane_size_in_bits, lane_size_in_bits, zn_inputs, zd_expected, &MacroAssembler::Frecpx, // Merging form. &MacroAssembler::Frecpx); // Zerging form. } TEST_SVE(sve_frecpx_h) { uint64_t zn_inputs[] = {Float16ToRawbits(kFP16PositiveInfinity), Float16ToRawbits(kFP16NegativeInfinity), Float16ToRawbits(Float16(0.0)), Float16ToRawbits(Float16(-0.0)), 0x0001, // Smallest positive subnormal number. 0x03ff, // Largest subnormal number. 0x0400, // Smallest positive normal number. 0x7bff, // Largest normal number. 0x3bff, // Largest number less than one. 0x3c01, // Smallest number larger than one. 0x7c22, // Signalling NaN. 0x7e55}; // Quiet NaN. uint64_t zd_expected[] = {0, 0x8000, 0x7800, 0xf800, // Exponent of subnormal numbers are zero. 0x7800, 0x7800, 0x7800, 0x0400, 0x4400, 0x4000, 0x7e22, // To quiet NaN. 0x7e55}; TestFrecpxHelper(config, kHRegSize, zn_inputs, zd_expected); } TEST_SVE(sve_frecpx_s) { uint64_t zn_inputs[] = {FloatToRawbits(kFP32PositiveInfinity), FloatToRawbits(kFP32NegativeInfinity), FloatToRawbits(65504), // Max half precision. FloatToRawbits(6.10352e-5), // Min positive normal. FloatToRawbits(6.09756e-5), // Max subnormal. FloatToRawbits( 5.96046e-8), // Min positive subnormal. FloatToRawbits(5e-9), // Not representable -> zero. FloatToRawbits(-0.0), FloatToRawbits(0.0), 0x7f952222, // Signalling NaN. 0x7fea2222}; // Quiet NaN; uint64_t zd_expected[] = {0, // 0.0 0x80000000, // -0.0 0x38800000, // 6.10352e-05 0x47000000, // 32768 0x47800000, // 65536 0x4c800000, // 6.71089e+07 0x4e000000, // 5.36871e+08 0xff000000, // -1.70141e+38 0x7f000000, // 1.70141e+38 0x7fd52222, 0x7fea2222}; TestFrecpxHelper(config, kSRegSize, zn_inputs, zd_expected); } TEST_SVE(sve_frecpx_d) { uint64_t zn_inputs[] = {DoubleToRawbits(kFP64PositiveInfinity), DoubleToRawbits(kFP64NegativeInfinity), DoubleToRawbits(65504), // Max half precision. DoubleToRawbits(6.10352e-5), // Min positive normal. DoubleToRawbits(6.09756e-5), // Max subnormal. DoubleToRawbits( 5.96046e-8), // Min positive subnormal. DoubleToRawbits(5e-9), // Not representable -> zero. DoubleToRawbits(-0.0), DoubleToRawbits(0.0), 0x7ff5555511111111, // Signalling NaN. 
0x7ffaaaaa11111111}; // Quiet NaN; uint64_t zd_expected[] = {0, // 0.0 0x8000000000000000, // -0.0 0x3f10000000000000, // 6.10352e-05 0x40e0000000000000, // 32768 0x40f0000000000000, // 65536 0x4190000000000000, // 6.71089e+07 0x41c0000000000000, // 5.36871e+08 0xffe0000000000000, // -1.70141e+38 0x7fe0000000000000, // 1.70141e+38 0x7ffd555511111111, 0x7ffaaaaa11111111}; TestFrecpxHelper(config, kDRegSize, zn_inputs, zd_expected); } template static void TestFsqrtHelper(Test* config, int lane_size_in_bits, T (&zn_inputs)[N], const T (&zd_expected)[N]) { TestFPUnaryPredicatedHelper(config, lane_size_in_bits, lane_size_in_bits, zn_inputs, zd_expected, &MacroAssembler::Fsqrt, // Merging form. &MacroAssembler::Fsqrt); // Zerging form. } TEST_SVE(sve_fsqrt_h) { uint64_t zn_inputs[] = {Float16ToRawbits(Float16(0.0)), Float16ToRawbits(Float16(-0.0)), Float16ToRawbits(Float16(1.0)), Float16ToRawbits(Float16(65025.0)), Float16ToRawbits(kFP16PositiveInfinity), Float16ToRawbits(kFP16NegativeInfinity), Float16ToRawbits(Float16(6.10352e-5)), // Min normal positive. Float16ToRawbits(Float16(65504.0)), // Max normal positive float. Float16ToRawbits(Float16(6.09756e-5)), // Max subnormal. Float16ToRawbits(Float16(5.96046e-8)), // Min subnormal positive. 0x7c22, // Signaling NaN 0x7e55}; // Quiet NaN uint64_t zd_expected[] = {Float16ToRawbits(Float16(0.0)), Float16ToRawbits(Float16(-0.0)), Float16ToRawbits(Float16(1.0)), Float16ToRawbits(Float16(255.0)), Float16ToRawbits(kFP16PositiveInfinity), Float16ToRawbits(kFP16DefaultNaN), 0x2000, 0x5bff, 0x1fff, 0x0c00, 0x7e22, // To quiet NaN. 0x7e55}; TestFsqrtHelper(config, kHRegSize, zn_inputs, zd_expected); } TEST_SVE(sve_fsqrt_s) { uint64_t zn_inputs[] = {FloatToRawbits(0.0f), FloatToRawbits(-0.0f), FloatToRawbits(1.0f), FloatToRawbits(65536.0f), FloatToRawbits(kFP32PositiveInfinity), FloatToRawbits(kFP32NegativeInfinity), 0x00800000, // Min normal positive, ~1.17e−38 0x7f7fffff, // Max normal positive, ~3.40e+38 0x00000001, // Min subnormal positive, ~1.40e−45 0x007fffff, // Max subnormal, ~1.17e−38 0x7f951111, // Signaling NaN 0x7fea1111}; // Quiet NaN uint64_t zd_expected[] = {FloatToRawbits(0.0f), FloatToRawbits(-0.0f), FloatToRawbits(1.0f), FloatToRawbits(256.0f), FloatToRawbits(kFP32PositiveInfinity), FloatToRawbits(kFP32DefaultNaN), 0x20000000, // ~1.08e-19 0x5f7fffff, // ~1.84e+19 0x1a3504f3, // ~3.74e-23 0x1fffffff, // ~1.08e-19 0x7fd51111, // To quiet NaN. 0x7fea1111}; TestFsqrtHelper(config, kSRegSize, zn_inputs, zd_expected); } TEST_SVE(sve_fsqrt_d) { uint64_t zn_inputs[] = {DoubleToRawbits(0.0), DoubleToRawbits(-0.0), DoubleToRawbits(1.0), DoubleToRawbits(65536.0), DoubleToRawbits(kFP64PositiveInfinity), DoubleToRawbits(kFP64NegativeInfinity), 0x0010000000000000, // Min normal positive, ~2.22e-308 0x7fefffffffffffff, // Max normal positive, ~1.79e+308 0x0000000000000001, // Min subnormal positive, 5e-324 0x000fffffffffffff, // Max subnormal, ~2.22e-308 0x7ff5555511111111, 0x7ffaaaaa11111111}; uint64_t zd_expected[] = {DoubleToRawbits(0.0), DoubleToRawbits(-0.0), DoubleToRawbits(1.0), DoubleToRawbits(256.0), DoubleToRawbits(kFP64PositiveInfinity), DoubleToRawbits(kFP64DefaultNaN), 0x2000000000000000, // ~1.49e-154 0x5fefffffffffffff, // ~1.34e+154 0x1e60000000000000, // ~2.22e-162 0x1fffffffffffffff, // ~1.49e-154 0x7ffd555511111111, // To quiet NaN. 
0x7ffaaaaa11111111}; TestFsqrtHelper(config, kDRegSize, zn_inputs, zd_expected); } TEST_SVE(sve_adr) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Index(z0.VnD(), 0x10000000f0000000, 0x1000); __ Index(z1.VnD(), 1, 3); __ Index(z2.VnS(), -1, -1); __ Adr(z3.VnD(), SVEMemOperand(z0.VnD(), z1.VnD())); __ Adr(z4.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 1)); __ Adr(z5.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 2)); __ Adr(z6.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 3)); __ Adr(z7.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW)); __ Adr(z8.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 1)); __ Adr(z9.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 2)); __ Adr(z10.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 3)); __ Adr(z11.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW)); __ Adr(z12.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 1)); __ Adr(z13.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 2)); __ Adr(z14.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 3)); __ Adr(z15.VnS(), SVEMemOperand(z0.VnS(), z2.VnS())); __ Adr(z16.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 1)); __ Adr(z17.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 2)); __ Adr(z18.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 3)); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z3[] = {0x10000000f0001004, 0x10000000f0000001}; uint64_t expected_z4[] = {0x10000000f0001008, 0x10000000f0000002}; uint64_t expected_z5[] = {0x10000000f0001010, 0x10000000f0000004}; uint64_t expected_z6[] = {0x10000000f0001020, 0x10000000f0000008}; uint64_t expected_z7[] = {0x10000001f0000ffd, 0x10000001efffffff}; uint64_t expected_z8[] = {0x10000002f0000ffa, 0x10000002effffffe}; uint64_t expected_z9[] = {0x10000004f0000ff4, 0x10000004effffffc}; uint64_t expected_z10[] = {0x10000008f0000fe8, 0x10000008effffff8}; uint64_t expected_z11[] = {0x10000000f0000ffd, 0x10000000efffffff}; uint64_t expected_z12[] = {0x10000000f0000ffa, 0x10000000effffffe}; uint64_t expected_z13[] = {0x10000000f0000ff4, 0x10000000effffffc}; uint64_t expected_z14[] = {0x10000000f0000fe8, 0x10000000effffff8}; uint64_t expected_z15[] = {0x0ffffffcf0000ffd, 0x0ffffffeefffffff}; uint64_t expected_z16[] = {0x0ffffff8f0000ffa, 0x0ffffffceffffffe}; uint64_t expected_z17[] = {0x0ffffff0f0000ff4, 0x0ffffff8effffffc}; uint64_t expected_z18[] = {0x0fffffe0f0000fe8, 0x0ffffff0effffff8}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); ASSERT_EQUAL_SVE(expected_z4, z4.VnD()); ASSERT_EQUAL_SVE(expected_z5, z5.VnD()); ASSERT_EQUAL_SVE(expected_z6, z6.VnD()); ASSERT_EQUAL_SVE(expected_z7, z7.VnD()); ASSERT_EQUAL_SVE(expected_z8, z8.VnD()); ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); ASSERT_EQUAL_SVE(expected_z10, z10.VnD()); ASSERT_EQUAL_SVE(expected_z11, z11.VnD()); ASSERT_EQUAL_SVE(expected_z12, z12.VnD()); ASSERT_EQUAL_SVE(expected_z13, z13.VnD()); ASSERT_EQUAL_SVE(expected_z14, z14.VnD()); ASSERT_EQUAL_SVE(expected_z15, z15.VnD()); ASSERT_EQUAL_SVE(expected_z16, z16.VnD()); ASSERT_EQUAL_SVE(expected_z17, z17.VnD()); ASSERT_EQUAL_SVE(expected_z18, z18.VnD()); } } // Test loads and broadcast by comparing them with the result of a set of // equivalent scalar loads. 
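// Ld1r<T> loads a single element from memory and broadcasts it to every
// active lane of the destination, so an equivalent reference vector can be
// built by issuing the same scalar load once per lane.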
template <typename F>
static void LoadBcastHelper(Test* config,
                            unsigned msize_in_bits,
                            unsigned esize_in_bits,
                            F sve_ld1,
                            bool is_signed) {
  VIXL_ASSERT((esize_in_bits == kBRegSize) || (esize_in_bits == kHRegSize) ||
              (esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));

  static const unsigned kMaxLaneCount = kZRegMaxSize / kBRegSize;

  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
  unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
  int vl = config->sve_vl_in_bytes();

  uint64_t offsets[kMaxLaneCount];
  uint64_t buffer_size = vl * 64;
  uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
  BufferFillingHelper(data, buffer_size, msize_in_bytes, kMaxLaneCount, offsets);

  for (unsigned i = 0; i < (kMaxLaneCount / 2); i++) {
    // Assign encodable offsets to the first half of the offset array so that
    // both encodable and unencodable offsets can be tested.
    // Note that the immediate offset is encoded in a six-bit field.
    offsets[i] = (offsets[i] % (UINT64_C(1) << 6)) * msize_in_bytes;
  }

  ZRegister zn = z0.WithLaneSize(esize_in_bits);
  ZRegister zn_ref = z4.WithLaneSize(esize_in_bits);

  PRegisterZ pg = p0.Zeroing();
  Initialise(&masm,
             pg,
             0x9abcdef012345678,
             0xabcdef0123456789,
             0xf4f3f1f0fefdfcfa,
             0xf9f8f6f5f3f2f0ff);

  __ Mov(x2, data);

  uint64_t enablable_offset = offsets[0];
  // A simple check that the operation is correct for a single offset.
  (masm.*sve_ld1)(zn, pg, SVEMemOperand(x2, enablable_offset));

  // Generate a reference result using scalar loads.
  uint64_t address = data + enablable_offset;
  uint64_t duplicated_addresses[kMaxLaneCount];
  for (unsigned i = 0; i < kMaxLaneCount; i++) {
    duplicated_addresses[i] = address;
  }

  ScalarLoadHelper(&masm,
                   vl,
                   duplicated_addresses,
                   zn_ref,
                   pg,
                   esize_in_bits,
                   msize_in_bits,
                   is_signed);

  ZRegister zn_agg = z10.WithLaneSize(esize_in_bits);
  ZRegister zn_agg_ref = z11.WithLaneSize(esize_in_bits);
  ZRegister zn_temp = z12.WithLaneSize(esize_in_bits);

  __ Dup(zn_agg, 0);
  __ Dup(zn_agg_ref, 0);

  // Check that the operation is correct for a range of different offsets.
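  // Each iteration broadcasts from offsets[i], extracts the last active lane
  // with Lastb and shifts it into an aggregate vector with Insr; the
  // reference aggregate is built the same way from an equivalent scalar load.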
for (unsigned i = 0; i < (vl / esize_in_bytes); i++) { (masm.*sve_ld1)(zn_temp, pg, SVEMemOperand(x2, offsets[i])); __ Lastb(x1, pg, zn_temp); __ Insr(zn_agg, x1); __ Mov(x3, data + offsets[i]); ScalarLoadHelper(&masm, x1, x3, msize_in_bits, is_signed); __ Insr(zn_agg_ref, x1); } END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zn_ref, zn); ASSERT_EQUAL_SVE(zn_agg_ref, zn_agg); } free(reinterpret_cast(data)); } TEST_SVE(sve_ld1rb) { LoadBcastHelper(config, kBRegSize, kBRegSize, &MacroAssembler::Ld1rb, false); LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rb, false); LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rb, false); LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rb, false); } TEST_SVE(sve_ld1rh) { LoadBcastHelper(config, kHRegSize, kHRegSize, &MacroAssembler::Ld1rh, false); LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rh, false); LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rh, false); } TEST_SVE(sve_ld1rw) { LoadBcastHelper(config, kSRegSize, kSRegSize, &MacroAssembler::Ld1rw, false); LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rw, false); } TEST_SVE(sve_ld1rd) { LoadBcastHelper(config, kDRegSize, kDRegSize, &MacroAssembler::Ld1rd, false); } TEST_SVE(sve_ld1rsb) { LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rsb, true); LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rsb, true); LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rsb, true); } TEST_SVE(sve_ld1rsh) { LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rsh, true); LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rsh, true); } TEST_SVE(sve_ld1rsw) { LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rsw, true); } TEST_SVE(sve_prefetch_offset) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Prfb(PLDL1KEEP, p5, SVEMemOperand(z30.VnS(), 0)); __ Prfb(PLDL1STRM, p5, SVEMemOperand(x28, -11, SVE_MUL_VL)); __ Prfb(PLDL2KEEP, p6, SVEMemOperand(x30, x29)); __ Prfb(PLDL2STRM, p6, SVEMemOperand(x7, z12.VnS(), UXTW)); __ Prfh(PSTL2KEEP, p6, SVEMemOperand(z0.VnS(), 28)); __ Prfh(PSTL2STRM, p4, SVEMemOperand(x17, -3, SVE_MUL_VL)); __ Prfh(PSTL3KEEP, p3, SVEMemOperand(x0, x0, LSL, 1)); __ Prfh(PSTL3STRM, p4, SVEMemOperand(x20, z0.VnD(), LSL, 1)); __ Prfw(PLDL1KEEP, p3, SVEMemOperand(z23.VnD(), 5)); __ Prfw(PLDL1STRM, p1, SVEMemOperand(x4, 10, SVE_MUL_VL)); __ Prfw(PLDL2KEEP, p2, SVEMemOperand(x22, x22, LSL, 2)); __ Prfw(PLDL2STRM, p1, SVEMemOperand(x2, z6.VnS(), SXTW, 2)); __ Prfd(PLDL3KEEP, p5, SVEMemOperand(z11.VnD(), 9)); __ Prfd(PLDL3STRM, p3, SVEMemOperand(x0, -24, SVE_MUL_VL)); __ Prfd(PSTL1KEEP, p7, SVEMemOperand(x5, x5, LSL, 3)); __ Prfd(PSTL1STRM, p1, SVEMemOperand(x19, z18.VnS(), SXTW, 3)); END(); if (CAN_RUN()) { RUN(); } } TEST_SVE(sve2_match_nmatch) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); __ Ptrue(p0.VnB()); __ Ptrue(p1.VnH()); __ Ptrue(p2.VnS()); // Vector to search is bytes 0 - 7, repeating every eight bytes. __ Index(z0.VnB(), 0, 1); __ Dup(z0.VnD(), z0.VnD(), 0); // Elements to find are (repeated) bytes 0 - 3 in the first segment, 4 - 7 // in the second, 8 - 11 in the third, etc. 
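  // Match sets a result lane when the corresponding element of the first
  // source is equal to any element within the same 128-bit segment of the
  // second source; Nmatch computes the complement under the governing
  // predicate.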
__ Index(z1.VnB(), 0, 1); __ Lsr(z1.VnB(), z1.VnB(), 2); __ Match(p3.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB()); __ Match(p4.VnB(), p1.Zeroing(), z0.VnB(), z1.VnB()); __ Nmatch(p0.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB()); __ Uunpklo(z0.VnH(), z0.VnB()); __ Uunpklo(z1.VnH(), z1.VnB()); __ Match(p5.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH()); __ Match(p6.VnH(), p2.Zeroing(), z0.VnH(), z1.VnH()); __ Nmatch(p1.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH()); END(); if (CAN_RUN()) { RUN(); int p3_exp[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}; ASSERT_EQUAL_SVE(p3_exp, p3.VnB()); int p4_exp[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1}; ASSERT_EQUAL_SVE(p4_exp, p4.VnB()); int p0_exp[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0}; ASSERT_EQUAL_SVE(p0_exp, p0.VnB()); int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1}; ASSERT_EQUAL_SVE(p5_exp, p5.VnB()); int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; ASSERT_EQUAL_SVE(p6_exp, p6.VnB()); int p1_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0}; ASSERT_EQUAL_SVE(p1_exp, p1.VnB()); } } TEST_SVE(sve2_saba_uaba) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); __ Index(z0.VnB(), 0, 1); __ Dup(z1.VnB(), 0xff); __ Dup(z2.VnB(), 1); __ Uaba(z2.VnB(), z2.VnB(), z0.VnB(), z1.VnB()); __ Index(z0.VnB(), 0, -1); __ Index(z3.VnH(), 0, 1); __ Index(z4.VnH(), 1, 1); __ Uaba(z3.VnH(), z3.VnH(), z3.VnH(), z4.VnH()); __ Index(z5.VnS(), 3, 6); __ Index(z6.VnS(), 5, 6); __ Uaba(z5.VnS(), z5.VnS(), z5.VnS(), z6.VnS()); __ Index(z7.VnD(), 424, 12); __ Index(z8.VnD(), 4242, 12); __ Uaba(z7.VnD(), z7.VnD(), z7.VnD(), z8.VnD()); __ Index(z9.VnH(), -1, -1); __ Dup(z10.VnB(), 0); __ Saba(z10.VnB(), z10.VnB(), z9.VnB(), z10.VnB()); __ Index(z11.VnH(), 0x0101, 1); __ Index(z12.VnH(), 0, 1); __ Index(z13.VnH(), 0, -1); __ Saba(z13.VnH(), z13.VnH(), z12.VnH(), z13.VnH()); __ Index(z14.VnS(), 0, 2); __ Index(z15.VnS(), 0, -2); __ Saba(z15.VnS(), z15.VnS(), z14.VnS(), z15.VnS()); __ Index(z16.VnD(), 0, 42); __ Index(z17.VnD(), 0, -42); __ Saba(z17.VnD(), z17.VnD(), z16.VnD(), z17.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z0, z2); ASSERT_EQUAL_SVE(z3, z4); ASSERT_EQUAL_SVE(z5, z6); ASSERT_EQUAL_SVE(z7, z8); ASSERT_EQUAL_SVE(z10, z11); ASSERT_EQUAL_SVE(z12, z13); ASSERT_EQUAL_SVE(z14, z15); ASSERT_EQUAL_SVE(z16, z17); } } TEST_SVE(sve2_integer_multiply_long_vector) { // The test just check Sqdmull[b|t] and Pmull[b|t], as the way how the element // operating of the other instructions in the group are likewise. 
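  // Sqdmullb and Sqdmullt widen the even- and odd-numbered source elements
  // respectively and form the saturating doubled product 2 * zn * zm, while
  // Pmullb and Pmullt form the carry-less (polynomial) product of the same
  // element pairs.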
int32_t zn_inputs_s[] = {1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN}; int32_t zm_inputs_s[] = {1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN}; int64_t sqdmullb_vec_expected_d[] = {-8, -32, -72, -128, RawbitsToInt64(0x8000000100000000), INT64_MAX}; uint64_t sqdmullt_vec_expected_d[] = {2, 18, 50, 98, 0x8000000100000000, 0x7ffffffe00000002}; uint64_t pmullb_vec_expected_d[] = {0x00000001fffffffc, 0x00000003fffffff0, 0x000000020000001c, 0x00000007ffffffc0, 0x3fffffff80000000, 0x4000000000000000}; uint64_t pmullt_vec_expected_d[] = {0x05, 0x11, 0x15, 0x3fffffff80000000, 0x1555555555555555}; uint64_t sqdmullb_idx_expected_d[] = {0xfffffffffffffff8, 0xfffffffffffffff0, 0xffffffffffffffb8, 0xffffffffffffffa0, 0x8000000100000000, INT64_MAX}; uint64_t sqdmullt_idx_expected_d[] = {8, // 2 * zn[11] * zm[8] = 2 * 4 * 1 24, // 2 * zn[9] * zm[8] = 2 * 4 * 3 80, // 2 * zn[7] * zm[4] = 2 * 8 * 5 112, // 2 * zn[5] * zm[4] = 2 * 8 * 7 0x7fffffffffffffff, // 2 * zn[3] * zm[0] 0x8000000100000000}; // 2 * zn[1] * zm[0] SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); InsrHelper(&masm, z31.VnS(), zn_inputs_s); InsrHelper(&masm, z30.VnS(), zm_inputs_s); __ Sqdmullb(z1.VnD(), z31.VnS(), z30.VnS()); __ Sqdmullt(z2.VnD(), z31.VnS(), z30.VnS()); __ Pmullb(z3.VnD(), z31.VnS(), z30.VnS()); __ Pmullt(z4.VnD(), z31.VnS(), z30.VnS()); __ Mov(z7, z30); __ Mov(z8, z31); __ Sqdmullb(z5.VnD(), z8.VnS(), z7.VnS(), 2); __ Sqdmullt(z6.VnD(), z8.VnS(), z7.VnS(), 0); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(sqdmullb_vec_expected_d, z1.VnD()); ASSERT_EQUAL_SVE(sqdmullt_vec_expected_d, z2.VnD()); ASSERT_EQUAL_SVE(pmullb_vec_expected_d, z3.VnD()); ASSERT_EQUAL_SVE(pmullt_vec_expected_d, z4.VnD()); ASSERT_EQUAL_SVE(sqdmullb_idx_expected_d, z5.VnD()); ASSERT_EQUAL_SVE(sqdmullt_idx_expected_d, z6.VnD()); } } TEST_SVE(sve2_integer_multiply_add_long_vector) { int32_t zn_inputs_s[] = {1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN}; int32_t zm_inputs_s[] = {1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN}; int64_t sqdmlalb_vec_expected_d[] = {-3, -28, -69, -126, RawbitsToInt64(0x8000000100000001), INT64_MAX}; int64_t sqdmlalt_vec_expected_d[] = {-3, 14, 47, 96, RawbitsToInt64(0x80000000ffffffff), static_cast( 0x7ffffffe00000002)}; int64_t sqdmlalb_idx_expected_d[] = {-11, // za.d[5] + 2 * zn.s[10] * zm.s[8] = 5 + 2 * -2 * 4 -28, // za.d[4] + 2 * zn.s[8] * zm.s[8] = 4 + 2 * -4 * 4 -93, // za.d[3] + 2 * zn.s[6] * zm.s[4] = 3 + 2 * -6 * 8 -126, // za.d[2] + 2 * zn.s[4] * zm.s[4] = 2 + 2 * -8 * 8 RawbitsToInt64(0x8000000100000001), INT64_MAX}; int64_t sqdmlalt_idx_expected_d[] = {1, // za.d[5] + 2 * zn.s[11] * zm.s[9] = -5 + 2 * 1 * 3 14, // za.d[4] + 2 * zn.s[9] * zm.s[9] = -4 + 2 * 3 * 3 67, // za.d[3] + 2 * zn.s[7] * zm.s[5] = -3 + 2 * 5 * 7 96, // za.d[2] + 2 * zn.s[5] * zm.s[5] = -2 + 2 * 7 * 7 RawbitsToInt64(0x80000000ffffffff), static_cast(0x7ffffffe00000002)}; SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); InsrHelper(&masm, z0.VnS(), zn_inputs_s); InsrHelper(&masm, z1.VnS(), zm_inputs_s); __ Index(z2.VnD(), 0, 1); __ Index(z3.VnD(), 0, -1); __ Mov(z31, z2); __ Sqdmlalb(z31.VnD(), z31.VnD(), z0.VnS(), z1.VnS()); __ Mov(z30, z3); __ Sqdmlalt(z30.VnD(), z30.VnD(), z0.VnS(), z1.VnS()); __ Mov(z29, z31); __ Sqdmlslb(z29.VnD(), z29.VnD(), z0.VnS(), z1.VnS()); __ Mov(z28, z30); __ Sqdmlslt(z28.VnD(), z28.VnD(), z0.VnS(), z1.VnS()); __ Sqdmlalb(z27.VnD(), z2.VnD(), z0.VnS(), z1.VnS()); __ 
Sqdmlalt(z26.VnD(), z3.VnD(), z0.VnS(), z1.VnS()); __ Sqdmlslb(z25.VnD(), z27.VnD(), z0.VnS(), z1.VnS()); __ Sqdmlslt(z24.VnD(), z26.VnD(), z0.VnS(), z1.VnS()); __ Mov(z23, z2); __ Sqdmlalb(z23.VnD(), z23.VnD(), z0.VnS(), z1.VnS(), 0); __ Mov(z22, z3); __ Sqdmlalt(z22.VnD(), z22.VnD(), z0.VnS(), z1.VnS(), 1); __ Mov(z21, z23); __ Sqdmlslb(z21.VnD(), z21.VnD(), z0.VnS(), z1.VnS(), 0); __ Mov(z20, z22); __ Sqdmlslt(z20.VnD(), z20.VnD(), z0.VnS(), z1.VnS(), 1); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(sqdmlalb_vec_expected_d, z31.VnD()); ASSERT_EQUAL_SVE(sqdmlalt_vec_expected_d, z30.VnD()); ASSERT_EQUAL_SVE(z2, z29); ASSERT_EQUAL_SVE(z3, z28); ASSERT_EQUAL_SVE(z31, z27); ASSERT_EQUAL_SVE(z30, z26); ASSERT_EQUAL_SVE(z29, z25); ASSERT_EQUAL_SVE(z28, z24); ASSERT_EQUAL_SVE(sqdmlalb_idx_expected_d, z23.VnD()); ASSERT_EQUAL_SVE(sqdmlalt_idx_expected_d, z22.VnD()); ASSERT_EQUAL_SVE(z2, z21); ASSERT_EQUAL_SVE(z3, z20); } } TEST_SVE(sve2_ldnt1) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); int data_size = kZRegMaxSizeInBytes * 4; uint8_t* data = new uint8_t[data_size]; for (int i = 0; i < data_size; i++) { data[i] = i & 0xff; } // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); __ Index(z30.VnD(), x0, 1); __ Ptrue(p0.VnB()); __ Punpklo(p1.VnH(), p0.VnB()); __ Punpklo(p2.VnH(), p1.VnB()); __ Punpklo(p3.VnH(), p2.VnB()); __ Punpklo(p4.VnH(), p3.VnB()); __ Mov(x1, 1); __ Ldnt1b(z0.VnD(), p1.Zeroing(), SVEMemOperand(z30.VnD(), x1)); __ Ld1b(z1.VnD(), p1.Zeroing(), SVEMemOperand(x1, z30.VnD())); __ Mov(x1, -4); __ Ldnt1h(z2.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1)); __ Ld1h(z3.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD())); __ Mov(x1, 16); __ Ldnt1w(z4.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1)); __ Ld1w(z5.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD())); __ Mov(x1, -16); __ Ldnt1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(z30.VnD(), x1)); __ Ld1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x1, z30.VnD())); __ Mov(x1, 1); __ Ldnt1sb(z8.VnD(), p0.Zeroing(), SVEMemOperand(z30.VnD(), x1)); __ Ld1sb(z9.VnD(), p0.Zeroing(), SVEMemOperand(x1, z30.VnD())); __ Mov(x1, -4); __ Ldnt1sh(z10.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1)); __ Ld1sh(z11.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD())); __ Mov(x1, 16); __ Ldnt1sw(z12.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1)); __ Ld1sw(z13.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD())); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z0, z1); ASSERT_EQUAL_SVE(z2, z3); ASSERT_EQUAL_SVE(z4, z5); ASSERT_EQUAL_SVE(z6, z7); ASSERT_EQUAL_SVE(z8, z9); ASSERT_EQUAL_SVE(z10, z11); ASSERT_EQUAL_SVE(z12, z13); } } TEST_SVE(sve2_stnt1) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); int data_size = kZRegMaxSizeInBytes * 4; uint8_t* data = new uint8_t[data_size]; // Set the base half-way through the buffer so we can use negative indices. __ Mov(x0, reinterpret_cast(&data[data_size / 2])); __ Ptrue(p0.VnB()); __ Punpklo(p1.VnH(), p0.VnB()); __ Punpklo(p2.VnH(), p1.VnB()); __ Punpklo(p3.VnH(), p2.VnB()); __ Punpklo(p4.VnH(), p3.VnB()); __ Dup(z0.VnB(), 0xaa); __ Dup(z1.VnB(), 0x55); __ Rdvl(x1, 1); __ Mov(x3, 0); // Put store addresses into z30, and a small offset in x4. __ Index(z30.VnD(), x0, 1); __ Mov(x4, 2); // Store an entire vector of 0xaa to the buffer, then a smaller scatter store // of 0x55 using Stnt1b. 
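  // z30 holds consecutive byte addresses, so the Stnt1b scatter below writes
  // one byte per D-sized lane, i.e. VL / 8 bytes in total. Whilelo builds a
  // predicate covering exactly those bytes so that Sel can merge the 0x55
  // pattern over the 0xaa background to form the reference vector.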
__ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4)); __ Stnt1b(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4)); // Load the entire vector back from the buffer. __ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4)); // Construct a predicate that reflects the number of bytes stored by Stnt1b, // based on the current VL, and use Sel to obtain a reference vector for // comparison. __ Lsr(x2, x1, 3); __ Whilelo(p5.VnB(), x3, x2); __ Sel(z3.VnB(), p5.Merging(), z1.VnB(), z0.VnB()); // Repeat for larger element sizes. __ Mov(x4, -4); __ Index(z30.VnD(), x0, 2); __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4)); __ Stnt1h(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4)); __ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4)); __ Lsr(x2, x1, 2); __ Whilelo(p5.VnB(), x3, x2); __ Sel(z5.VnB(), p5.Merging(), z1.VnB(), z0.VnB()); __ Mov(x4, 16); __ Index(z30.VnD(), x0, 4); __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4)); __ Stnt1w(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4)); __ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4)); __ Lsr(x2, x1, 1); __ Whilelo(p5.VnB(), x3, x2); __ Sel(z7.VnB(), p5.Merging(), z1.VnB(), z0.VnB()); __ Mov(x4, -16); __ Index(z30.VnD(), x0, 8); __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4)); __ Stnt1d(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4)); __ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4)); __ Whilelo(p5.VnB(), x3, x1); __ Sel(z9.VnB(), p5.Merging(), z1.VnB(), z0.VnB()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z2, z3); ASSERT_EQUAL_SVE(z4, z5); ASSERT_EQUAL_SVE(z6, z7); ASSERT_EQUAL_SVE(z8, z9); } } TEST_SVE(sve2_while_simple) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); __ Mov(x0, 1); __ Mov(x1, 0); __ Mov(x2, 3); __ Whilehi(p0.VnB(), x0, x1); __ Whilehs(p1.VnB(), x0, x1); __ Whilehi(p2.VnB(), x2, x1); __ Whilehs(p3.VnB(), x2, x1); __ Whilehi(p4.VnB(), x2, x0); __ Whilehs(p5.VnB(), x2, x0); __ Whilegt(p6.VnB(), x0, x1); __ Whilege(p7.VnB(), x0, x1); __ Whilegt(p8.VnB(), x2, x1); __ Whilege(p9.VnB(), x2, x1); __ Whilegt(p10.VnB(), x2, x0); __ Whilege(p11.VnB(), x2, x0); __ Mov(x4, 0x80000000); __ Mov(x5, 0x80000001); __ Whilege(p12.VnB(), w5, w4); __ Whilegt(p13.VnB(), w5, w4); __ Mov(x6, 0x8000000000000000); __ Mov(x7, 0x8000000000000001); __ Whilege(p14.VnB(), x7, x6); __ Whilegt(p15.VnB(), x7, x6); for (int i = 0; i < 16; i++) { __ Rev(PRegister(i).VnB(), PRegister(i).VnB()); } END(); if (CAN_RUN()) { RUN(); int p0_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int p1_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1}; int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1}; int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1}; int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1}; int p8_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1}; int p9_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; int p10_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1}; int p11_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1}; int p12_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; int p13_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; int p14_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; int p15_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; ASSERT_EQUAL_SVE(p0_exp, p0.VnB()); ASSERT_EQUAL_SVE(p1_exp, p1.VnB()); ASSERT_EQUAL_SVE(p2_exp, p2.VnB()); 
ASSERT_EQUAL_SVE(p3_exp, p3.VnB()); ASSERT_EQUAL_SVE(p4_exp, p4.VnB()); ASSERT_EQUAL_SVE(p5_exp, p5.VnB()); ASSERT_EQUAL_SVE(p6_exp, p6.VnB()); ASSERT_EQUAL_SVE(p7_exp, p7.VnB()); ASSERT_EQUAL_SVE(p8_exp, p8.VnB()); ASSERT_EQUAL_SVE(p9_exp, p9.VnB()); ASSERT_EQUAL_SVE(p10_exp, p10.VnB()); ASSERT_EQUAL_SVE(p11_exp, p11.VnB()); ASSERT_EQUAL_SVE(p12_exp, p12.VnB()); ASSERT_EQUAL_SVE(p13_exp, p13.VnB()); ASSERT_EQUAL_SVE(p14_exp, p14.VnB()); ASSERT_EQUAL_SVE(p15_exp, p15.VnB()); } } TEST_SVE(sve2_whilerw_whilewr_simple) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); __ Mov(x0, 0); __ Mov(x1, 1); __ Mov(x2, 3); __ Whilerw(p0.VnB(), x0, x0); __ Whilerw(p1.VnB(), x0, x1); __ Whilerw(p2.VnB(), x1, x0); __ Whilewr(p3.VnB(), x0, x0); __ Whilewr(p4.VnB(), x0, x1); __ Whilewr(p5.VnB(), x1, x0); __ Whilewr(p6.VnH(), x1, x1); __ Whilewr(p7.VnH(), x1, x2); __ Whilewr(p8.VnH(), x2, x1); END(); if (CAN_RUN()) { RUN(); int p0_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; ASSERT_EQUAL_SVE(p0_exp, p0.VnB()); int p1_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; ASSERT_EQUAL_SVE(p1_exp, p1.VnB()); int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; ASSERT_EQUAL_SVE(p2_exp, p2.VnB()); int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; ASSERT_EQUAL_SVE(p3_exp, p3.VnB()); int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; ASSERT_EQUAL_SVE(p4_exp, p4.VnB()); int p5_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; ASSERT_EQUAL_SVE(p5_exp, p5.VnB()); int p6_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; ASSERT_EQUAL_SVE(p6_exp, p6.VnB()); int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; ASSERT_EQUAL_SVE(p7_exp, p7.VnB()); int p8_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; ASSERT_EQUAL_SVE(p8_exp, p8.VnB()); } } TEST_SVE(sve2_sqrdcmlah) { int32_t zn_inputs[] = {-1, -2, -3, -4, 1, 2, 3, 4}; int32_t zm_inputs[] = {-1, -2, 3, 4, 1, 2, -3, -4}; int32_t za_inputs[] = {1, 2, 3, 4, 5, 6, 7, 8}; int32_t zd_000_expected[] = {1025, 2050, -6141, -8188, 1029, 2054, -6137, -8184}; int32_t zd_090_expected[] = {1025, -510, -6141, 4612, 1029, -506, -6137, 4616}; int32_t zd_180_expected[] = {-1023, -2046, 6147, 8196, -1019, -2042, 6151, 8200}; int32_t zd_270_expected[] = {-1023, 514, 6147, -4604, -1019, 518, 6151, -4600}; int32_t zd_0_270_expected[] = {2049, -1534, 6147, -4604, 2053, -1530, 6151, -4600}; int32_t zd_3_090_expected[] = {1025, -510, 3075, -1532, 1029, -506, 3079, -1528}; SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); InsrHelper(&masm, z0.VnS(), zn_inputs); InsrHelper(&masm, z1.VnS(), zm_inputs); InsrHelper(&masm, z31.VnS(), za_inputs); // When the value in operands is small, shift left a random value so that it // can affect the result in destination. 
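  // To see why the shift below matters: for S-sized lanes the accumulation
  // effectively computes sat((acc << 32) + 2 * n * m + (1 << 31)) >> 32, so
  // with the raw inputs (|n|, |m| <= 4) the doubled product vanishes in the
  // final shift and every result would simply equal za. After shifting both
  // operands left by 20, lane 0 with rotation 0 becomes
  // sat((8 << 32) + 2 * (4 << 20) * (-4 << 20) + (1 << 31)) >> 32 = -8184,
  // which is the last (lane 0) element of zd_000_expected.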
int shift = 20; __ Lsl(z0.VnS(), z0.VnS(), shift); __ Lsl(z1.VnS(), z1.VnS(), shift); __ Mov(z10, z31); __ Sqrdcmlah(z10.VnS(), z10.VnS(), z0.VnS(), z1.VnS(), 0); __ Mov(z11, z31); __ Sqrdcmlah(z11.VnS(), z11.VnS(), z0.VnS(), z1.VnS(), 90); __ Mov(z12, z31); __ Sqrdcmlah(z12.VnS(), z12.VnS(), z0.VnS(), z1.VnS(), 180); __ Mov(z13, z31); __ Sqrdcmlah(z13.VnS(), z13.VnS(), z0.VnS(), z1.VnS(), 270); __ Sqrdcmlah(z14.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 0); __ Sqrdcmlah(z15.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 90); __ Sqrdcmlah(z16.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 180); __ Sqrdcmlah(z17.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 270); __ Mov(z18, z31); __ Sqrdcmlah(z18.VnS(), z18.VnS(), z0.VnS(), z1.VnS(), 0, 270); __ Mov(z19, z31); __ Sqrdcmlah(z19.VnS(), z19.VnS(), z0.VnS(), z1.VnS(), 1, 90); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zd_000_expected, z10.VnS()); ASSERT_EQUAL_SVE(zd_090_expected, z11.VnS()); ASSERT_EQUAL_SVE(zd_180_expected, z12.VnS()); ASSERT_EQUAL_SVE(zd_270_expected, z13.VnS()); ASSERT_EQUAL_SVE(z14, z10); ASSERT_EQUAL_SVE(z15, z11); ASSERT_EQUAL_SVE(z16, z12); ASSERT_EQUAL_SVE(z17, z13); ASSERT_EQUAL_SVE(zd_0_270_expected, z18.VnS()); ASSERT_EQUAL_SVE(zd_3_090_expected, z19.VnS()); } } TEST_SVE(sve2_sqrdmlah) { uint16_t zn_inputs_h[] = {0x7ffe, 0x7ffd, 0x7ffd, 0x7ffd, 0x8000, 0x7fff, 0x7ffe, 0x7ffe, 0x8001, 0x8000, 0x7ffd, 0x7ffd, 0x7ffd, 0x5555, 0x5555, 0x5555, 0x8000, 0x8000, 0xaaaa, 0x8001}; uint16_t zm_inputs_h[] = {0x7ffd, 0x7fff, 0x7ffe, 0x7ffd, 0x8001, 0x7fff, 0x7fff, 0x7ffe, 0x8000, 0x8000, 0xaaaa, 0x0001, 0x0001, 0xaaaa, 0xaaaa, 0xcccc, 0x8000, 0x8000, 0x8000, 0x8001}; uint16_t za_inputs_h[] = {0x1010, 0x1010, 0x1010, 0x1010, 0x1010, 0x1010, 0x1010, 0x1010, 0x8000, 0x8011, 0x8006, 0xff7d, 0xfeff, 0xaabc, 0xaabb, 0x9c72, 0x8000, 0x0000, 0x8000, 0xffff}; uint16_t zd_expected_h[] = {0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0xffff, 0x0011, 0x8000, 0xff7e, 0xff00, 0x8000, 0x8000, 0x8000, 0x0000, 0x7fff, 0xd556, 0x7ffd}; uint32_t zn_inputs_s[] = {0x04000000, 0x80000000, 0x04000000, 0x80000000, 0x80000000, 0x80000001, 0x7fffffff, 0x80000000, 0x7ffffffe, 0x7ffffffd, 0x7ffffffd, 0x7ffffffd}; uint32_t zm_inputs_s[] = {0x00000020, 0x80000000, 0x00000010, 0x80000000, 0x7fffffff, 0x80000000, 0x80000000, 0x80000001, 0x7ffffffd, 0x7fffffff, 0x7ffffffe, 0x7ffffffd}; uint32_t za_inputs_s[] = {0x00000000, 0x00000000, 0x00000020, 0x00108000, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x10101010, 0x10101010, 0x10101010, 0x10101010}; uint32_t zd_expected_s[] = {0x00000001, 0x7fffffff, 0x00000021, 0x7fffffff, 0x80000001, 0x7fffffff, 0x80000001, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; uint64_t zn_inputs_d[] = {0x0400000000000000, 0x8000000000000000, 0x0400000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000001, 0x7fffffffffffffff, 0x8000000000000000, 0x7ffffffffffffffe, 0x7ffffffffffffffd, 0x7ffffffffffffffd, 0x7ffffffffffffffd, 0xf1299accc9186169, 0xd529d2675ee9da21, 0x1a10b5d60b92dcf9, 0xfb1d358e0e6455b1, 0x8eb7721078bdc589, 0x4171509750ded141, 0x8eb7721078bdc589, 0x4171509750ded141}; uint64_t zm_inputs_d[] = {0x0000000000000020, 0x8000000000000000, 0x0000000000000010, 0x8000000000000000, 0x7fffffffffffffff, 0x8000000000000000, 0x8000000000000000, 0x8000000000000001, 0x7ffffffffffffffd, 0x7fffffffffffffff, 0x7ffffffffffffffe, 0x7ffffffffffffffd, 0x30b940efe73f180e, 0x3bc1ff1e52a99b66, 0x40de5c9793535a5e, 0x24752faf47bdddb6, 0x162663016b07e5ae, 0x1de34b56f3d22006, 0x8eb7721078bdc589, 0x4171509750ded141}; uint64_t 
za_inputs_d[] = {0x0000000000000000, 0x0000000000000000, 0x0000000000000020, 0x0010108000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, 0x0000000000000001, 0x1010101010101010, 0x1010101010101010, 0x1010101010101010, 0x1010101010101010, 0xb18253371b2c2c77, 0xa70de31e6645eaef, 0xda817198c0318487, 0x9fd9e6b8e04b42ff, 0xced1f6b7119ab197, 0x01ae051a85509b0f, 0x01a211e9352f7927, 0x7667b70a5b13749f}; uint64_t zd_expected_d[] = {0x0000000000000001, 0x7fffffffffffffff, 0x0000000000000021, 0x7fffffffffffffff, 0x8000000000000001, 0x7fffffffffffffff, 0x8000000000000001, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0xabdc73dea0d72a35, 0x930e3dc877301966, 0xe7b7145a059f8a9f, 0x9e75a4a9d10cf8af, 0xbb378528642d2581, 0x10f5e6d693ffddf3, 0x65e455a46adc091c, 0x7fffffffffffffff}; SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); InsrHelper(&masm, z0.VnH(), zn_inputs_h); InsrHelper(&masm, z1.VnH(), zm_inputs_h); InsrHelper(&masm, z2.VnH(), za_inputs_h); __ Sqrdmlah(z2.VnH(), z2.VnH(), z0.VnH(), z1.VnH()); InsrHelper(&masm, z3.VnS(), zn_inputs_s); InsrHelper(&masm, z4.VnS(), zm_inputs_s); InsrHelper(&masm, z5.VnS(), za_inputs_s); __ Sqrdmlah(z5.VnS(), z5.VnS(), z3.VnS(), z4.VnS()); InsrHelper(&masm, z6.VnD(), zn_inputs_d); InsrHelper(&masm, z7.VnD(), zm_inputs_d); InsrHelper(&masm, z8.VnD(), za_inputs_d); __ Sqrdmlah(z8.VnD(), z8.VnD(), z6.VnD(), z7.VnD()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zd_expected_h, z2.VnH()); ASSERT_EQUAL_SVE(zd_expected_s, z5.VnS()); ASSERT_EQUAL_SVE(zd_expected_d, z8.VnD()); } } TEST_SVE(sve2_cmla) { int32_t zn_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8}; int32_t zm_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8}; int32_t zda_inputs_s[] = {1, 2, 3, 4, 5, 6, 7, 8}; int32_t zd_000_expected[] = {9, 18, 51, 68, 13, 22, 55, 72}; int32_t zd_090_expected[] = {9, -2, 51, -32, 13, 2, 55, -28}; int32_t zd_180_expected[] = {-7, -14, -45, -60, -3, -10, -41, -56}; int32_t zd_270_expected[] = {-7, 6, -45, 40, -3, 10, -41, 44}; SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); InsrHelper(&masm, z31.VnS(), zn_inputs_s); InsrHelper(&masm, z30.VnS(), zm_inputs_s); InsrHelper(&masm, z0.VnS(), zda_inputs_s); __ Mov(z29, z0); __ Cmla(z0.VnS(), z0.VnS(), z31.VnS(), z30.VnS(), 0); InsrHelper(&masm, z1.VnS(), zda_inputs_s); __ Mov(z28, z1); __ Cmla(z1.VnS(), z1.VnS(), z31.VnS(), z30.VnS(), 90); InsrHelper(&masm, z2.VnS(), zda_inputs_s); __ Mov(z27, z2); __ Cmla(z2.VnS(), z2.VnS(), z31.VnS(), z30.VnS(), 180); InsrHelper(&masm, z3.VnS(), zda_inputs_s); __ Mov(z26, z3); __ Cmla(z3.VnS(), z3.VnS(), z31.VnS(), z30.VnS(), 270); __ Cmla(z4.VnS(), z29.VnS(), z31.VnS(), z30.VnS(), 0); __ Cmla(z5.VnS(), z28.VnS(), z31.VnS(), z30.VnS(), 90); __ Cmla(z6.VnS(), z27.VnS(), z31.VnS(), z30.VnS(), 180); __ Cmla(z7.VnS(), z26.VnS(), z31.VnS(), z30.VnS(), 270); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(zd_000_expected, z0.VnS()); ASSERT_EQUAL_SVE(zd_090_expected, z1.VnS()); ASSERT_EQUAL_SVE(zd_180_expected, z2.VnS()); ASSERT_EQUAL_SVE(zd_270_expected, z3.VnS()); ASSERT_EQUAL_SVE(z4, z0); ASSERT_EQUAL_SVE(z5, z1); ASSERT_EQUAL_SVE(z6, z2); ASSERT_EQUAL_SVE(z7, z3); } } TEST_SVE(sve2_integer_saturating_multiply_add_long) { int32_t zn_bottom_inputs[] = {-2, -4, -6, -8, INT32_MAX, INT32_MIN, INT32_MIN}; int32_t zm_top_inputs[] = {1, 3, 5, 7, INT32_MAX, INT32_MAX, INT32_MIN}; int64_t sqdmlalbt_expected[] = {2, -19, -56, -109, static_cast(0x7ffffffe00000004), RawbitsToInt64(0x8000000100000001), 
INT64_MAX}; int64_t sqdmlslbt_expected[] = {-2, 19, 56, 109, RawbitsToInt64(0x80000001fffffffc), static_cast(0x7ffffffeffffffff), RawbitsToInt64(0x8000000000000001)}; SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); InsrHelper(&masm, z31.VnS(), zn_bottom_inputs); InsrHelper(&masm, z30.VnS(), zm_top_inputs); __ Dup(z29.VnD(), 0); __ Zip1(z31.VnS(), z31.VnS(), z29.VnS()); __ Zip1(z30.VnS(), z29.VnS(), z30.VnS()); // Initialise inputs for za. __ Index(z1.VnD(), 0, 1); __ Index(z2.VnD(), 0, -1); __ Sqdmlalbt(z1.VnD(), z1.VnD(), z31.VnS(), z30.VnS()); __ Sqdmlslbt(z2.VnD(), z2.VnD(), z31.VnS(), z30.VnS()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(sqdmlalbt_expected, z1.VnD()); ASSERT_EQUAL_SVE(sqdmlslbt_expected, z2.VnD()); } } TEST_SVE(sve2_floating_point_multiply_add_long_vector) { uint16_t zn_inputs[] = {Float16ToRawbits(Float16(1000)), Float16ToRawbits(Float16(2000)), Float16ToRawbits(Float16(0.5)), Float16ToRawbits(Float16(-0.5)), Float16ToRawbits(Float16(14)), Float16ToRawbits(Float16(-14)), Float16ToRawbits(kFP16PositiveInfinity), Float16ToRawbits(kFP16NegativeInfinity)}; uint16_t zm_inputs[] = {Float16ToRawbits(Float16(10)), Float16ToRawbits(Float16(-10)), Float16ToRawbits(Float16(10)), Float16ToRawbits(Float16(-10)), Float16ToRawbits(Float16(10)), Float16ToRawbits(Float16(-10)), Float16ToRawbits(Float16(10)), Float16ToRawbits(Float16(-10))}; uint32_t za_inputs[] = {FloatToRawbits(1.0f), FloatToRawbits(-1.0f), FloatToRawbits(1.0f), FloatToRawbits(-1.0f)}; uint32_t fmlalb_zd_expected[] = {0xc69c3e00, // -19999 0x40800000, // 4 0x430d0000, // 141 FloatToRawbits(kFP32PositiveInfinity)}; uint32_t fmlalt_zd_expected[] = {0x461c4400, // 10001 0x40800000, // 4 0x430d0000, // 141 FloatToRawbits(kFP32PositiveInfinity)}; uint32_t fmlslb_zd_expected[] = {0x469c4200, // 20001 0xc0c00000, // -6 0xc30b0000, // -139 FloatToRawbits(kFP32NegativeInfinity)}; uint32_t fmlslt_zd_expected[] = {0xc61c3c00, // -9999 0xc0c00000, // -6 0xc30b0000, // -139 FloatToRawbits(kFP32NegativeInfinity)}; SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); InsrHelper(&masm, z31.VnH(), zn_inputs); InsrHelper(&masm, z30.VnH(), zm_inputs); InsrHelper(&masm, z29.VnS(), za_inputs); __ Mov(z0, z29); __ Fmlalb(z0.VnS(), z0.VnS(), z31.VnH(), z30.VnH()); __ Mov(z1, z29); __ Fmlalt(z1.VnS(), z1.VnS(), z31.VnH(), z30.VnH()); __ Mov(z2, z29); __ Fmlslb(z2.VnS(), z2.VnS(), z31.VnH(), z30.VnH()); __ Mov(z3, z29); __ Fmlslt(z3.VnS(), z3.VnS(), z31.VnH(), z30.VnH()); __ Fmlalb(z4.VnS(), z29.VnS(), z31.VnH(), z30.VnH()); __ Fmlalt(z5.VnS(), z29.VnS(), z31.VnH(), z30.VnH()); __ Fmlslb(z6.VnS(), z29.VnS(), z31.VnH(), z30.VnH()); __ Fmlslt(z7.VnS(), z29.VnS(), z31.VnH(), z30.VnH()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(fmlalb_zd_expected, z0.VnS()); ASSERT_EQUAL_SVE(fmlalt_zd_expected, z1.VnS()); ASSERT_EQUAL_SVE(fmlslb_zd_expected, z2.VnS()); ASSERT_EQUAL_SVE(fmlslt_zd_expected, z3.VnS()); ASSERT_EQUAL_SVE(z4, z0); ASSERT_EQUAL_SVE(z5, z1); ASSERT_EQUAL_SVE(z6, z2); ASSERT_EQUAL_SVE(z7, z3); } } TEST_SVE(sve2_flogb_simple) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2); START(); __ Ptrue(p0.VnB()); __ Index(z0.VnS(), -4, 1); __ Mov(z1.VnS(), 0); __ Mov(z2.VnD(), 0x000fffffffffffff); __ Mov(z3.VnD(), 0x0010000000000000); __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS()); __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS()); __ Fdiv(z1.VnS(), p0.Merging(), z0.VnS(), z1.VnS()); __ Flogb(z0.VnS(), p0.Merging(), z0.VnS()); __ Flogb(z1.VnS(), p0.Merging(), z1.VnS()); __ 
Flogb(z2.VnD(), p0.Merging(), z2.VnD()); __ Flogb(z3.VnD(), p0.Merging(), z3.VnD()); END(); if (CAN_RUN()) { RUN(); uint64_t expected_z0[] = {0x0000000200000002, 0x0000000200000002, 0x0000000100000001, 0x0000000080000000, 0x0000000000000001, 0x0000000100000002}; ASSERT_EQUAL_SVE(expected_z0, z0.VnD()); uint64_t expected_z1[] = {0x7fffffff7fffffff, 0x7fffffff7fffffff, 0x7fffffff7fffffff, 0x7fffffff80000000, 0x7fffffff7fffffff, 0x7fffffff7fffffff}; ASSERT_EQUAL_SVE(expected_z1, z1.VnD()); uint64_t expected_z2[] = {0xfffffffffffffc01, 0xfffffffffffffc01, 0xfffffffffffffc01, 0xfffffffffffffc01}; ASSERT_EQUAL_SVE(expected_z2, z2.VnD()); uint64_t expected_z3[] = {0xfffffffffffffc02, 0xfffffffffffffc02, 0xfffffffffffffc02, 0xfffffffffffffc02}; ASSERT_EQUAL_SVE(expected_z3, z3.VnD()); } } TEST_SVE(neon_matmul) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEI8MM, CPUFeatures::kNEON, CPUFeatures::kI8MM); // Test Neon integer matrix multiply against SVE. START(); __ Movi(v0.V2D(), 0xffeeddccbbaa9988, 0x77665544332211); __ Movi(v1.V2D(), 0xaa5555aa55555555, 0x55aaaa55aaaaaa); __ Movi(v2.V2D(), 0, 0); __ Movi(v3.V2D(), 0, 0); __ Movi(v4.V2D(), 0, 0); __ Movi(v5.V2D(), 0, 0); __ Movi(v6.V2D(), 0, 0); __ Movi(v7.V2D(), 0, 0); __ Smmla(v2.V4S(), v0.V16B(), v1.V16B()); __ Smmla(z3.VnS(), z3.VnS(), z0.VnB(), z1.VnB()); __ Ummla(v4.V4S(), v0.V16B(), v1.V16B()); __ Ummla(z5.VnS(), z5.VnS(), z0.VnB(), z1.VnB()); __ Usmmla(v6.V4S(), v0.V16B(), v1.V16B()); __ Usmmla(z7.VnS(), z7.VnS(), z0.VnB(), z1.VnB()); END(); if (CAN_RUN()) { RUN(); // The inputs as Z registers are zero beyond the least-significant 128 bits, // so the Neon and SVE results should be equal for any VL. ASSERT_EQUAL_SVE(z3, z2); ASSERT_EQUAL_SVE(z5, z4); ASSERT_EQUAL_SVE(z7, z6); } } TEST_SVE(sudot_usdot) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2, CPUFeatures::kSVEI8MM); START(); __ Ptrue(p0.VnB()); __ Index(z0.VnS(), -424242, 77777); __ Index(z1.VnB(), 127, -1); __ Sqabs(z1.VnB(), p0.Merging(), z1.VnB()); __ Index(z2.VnB(), 0, 1); __ Sqabs(z2.VnB(), p0.Merging(), z2.VnB()); __ Index(z3.VnB(), -128, 1); __ Mov(z4.VnD(), 0); // Test Usdot against Udot/Sdot over the range of inputs where they should be // equal. __ Usdot(z5.VnS(), z0.VnS(), z1.VnB(), z2.VnB()); __ Udot(z6.VnS(), z0.VnS(), z1.VnB(), z2.VnB()); __ Usdot(z7.VnS(), z0.VnS(), z1.VnB(), z3.VnB()); __ Sdot(z8.VnS(), z0.VnS(), z1.VnB(), z3.VnB()); // Construct values which, when interpreted correctly as signed/unsigned, // should give a zero result for dot product. __ Mov(z10.VnS(), 0x8101ff40); // [-127, 1, -1, 64] as signed bytes. __ Mov(z11.VnS(), 0x02fe8002); // [2, 254, 128, 2] as unsigned bytes. __ Usdot(z12.VnS(), z4.VnS(), z11.VnB(), z10.VnB()); __ Usdot(z13.VnS(), z4.VnS(), z10.VnB(), z11.VnB()); // Construct a vector with duplicated values across segments. This allows // testing indexed dot product against the already tested variant. 
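  // The duplication relies on Mul (indexed): multiplying an all-ones vector
  // by S-element 1 of each 128-bit segment of z3 broadcasts that element
  // across its segment, which is exactly the operand selected by the indexed
  // Usdot/Sudot forms with index 1, so the indexed and vector forms can be
  // checked against each other.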
__ Mov(z14.VnS(), 1); __ Mul(z15.VnS(), z14.VnS(), z3.VnS(), 1); __ Usdot(z16.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1); __ Usdot(z17.VnS(), z0.VnS(), z3.VnB(), z15.VnB()); __ Sudot(z18.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1); __ Usdot(z19.VnS(), z0.VnS(), z15.VnB(), z3.VnB()); END(); if (CAN_RUN()) { RUN(); ASSERT_EQUAL_SVE(z6, z5); ASSERT_EQUAL_SVE(z8, z7); ASSERT_EQUAL_SVE(z4, z12); uint64_t z13_expected[] = {0xffff8200ffff8200, 0xffff8200ffff8200}; ASSERT_EQUAL_SVE(z13_expected, z13.VnD()); ASSERT_EQUAL_SVE(z17, z16); ASSERT_EQUAL_SVE(z19, z18); } } TEST_SVE(sve_load_store_sp_base_regression_test) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); __ Mov(x0, 0); __ Mov(z0.VnB(), 0); __ Ptrue(p0.VnB()); Label loop; __ Mov(x1, 128); __ Bind(&loop); __ Push(xzr, xzr); __ Sub(x1, x1, 1); __ Cbnz(x1, &loop); { ExactAssemblyScope scope(&masm, 193 * kInstructionSize); __ dci(0xa420a3e0); // ld1b {z0.h}, p0/z, [sp] __ dci(0xa440a3e0); // ld1b {z0.s}, p0/z, [sp] __ dci(0xa460a3e0); // ld1b {z0.d}, p0/z, [sp] __ dci(0xa400a3e0); // ld1b {z0.b}, p0/z, [sp] __ dci(0xa42043e0); // ld1b {z0.h}, p0/z, [sp, x0] __ dci(0xa44043e0); // ld1b {z0.s}, p0/z, [sp, x0] __ dci(0xa46043e0); // ld1b {z0.d}, p0/z, [sp, x0] __ dci(0xa40043e0); // ld1b {z0.b}, p0/z, [sp, x0] __ dci(0xc440c3e0); // ld1b {z0.d}, p0/z, [sp, z0.d] __ dci(0xa5e0a3e0); // ld1d {z0.d}, p0/z, [sp] __ dci(0xa5e043e0); // ld1d {z0.d}, p0/z, [sp, x0, lsl #3] __ dci(0xc5e0c3e0); // ld1d {z0.d}, p0/z, [sp, z0.d, lsl #3] __ dci(0xc5c0c3e0); // ld1d {z0.d}, p0/z, [sp, z0.d] __ dci(0xa4a0a3e0); // ld1h {z0.h}, p0/z, [sp] __ dci(0xa4c0a3e0); // ld1h {z0.s}, p0/z, [sp] __ dci(0xa4e0a3e0); // ld1h {z0.d}, p0/z, [sp] __ dci(0xa4a043e0); // ld1h {z0.h}, p0/z, [sp, x0, lsl #1] __ dci(0xa4c043e0); // ld1h {z0.s}, p0/z, [sp, x0, lsl #1] __ dci(0xa4e043e0); // ld1h {z0.d}, p0/z, [sp, x0, lsl #1] __ dci(0xc4e0c3e0); // ld1h {z0.d}, p0/z, [sp, z0.d, lsl #1] __ dci(0xc4c0c3e0); // ld1h {z0.d}, p0/z, [sp, z0.d] __ dci(0x8440a3e0); // ld1rb {z0.h}, p0/z, [sp] __ dci(0x8440c3e0); // ld1rb {z0.s}, p0/z, [sp] __ dci(0x8440e3e0); // ld1rb {z0.d}, p0/z, [sp] __ dci(0x844083e0); // ld1rb {z0.b}, p0/z, [sp] __ dci(0x85c0e3e0); // ld1rd {z0.d}, p0/z, [sp] __ dci(0x84c0a3e0); // ld1rh {z0.h}, p0/z, [sp] __ dci(0x84c0c3e0); // ld1rh {z0.s}, p0/z, [sp] __ dci(0x84c0e3e0); // ld1rh {z0.d}, p0/z, [sp] __ dci(0xa40023e0); // ld1rqb {z0.b}, p0/z, [sp] __ dci(0xa40003e0); // ld1rqb {z0.b}, p0/z, [sp, x0] __ dci(0xa58023e0); // ld1rqd {z0.d}, p0/z, [sp] __ dci(0xa58003e0); // ld1rqd {z0.d}, p0/z, [sp, x0, lsl #3] __ dci(0xa48023e0); // ld1rqh {z0.h}, p0/z, [sp] __ dci(0xa48003e0); // ld1rqh {z0.h}, p0/z, [sp, x0, lsl #1] __ dci(0xa50023e0); // ld1rqw {z0.s}, p0/z, [sp] __ dci(0xa50003e0); // ld1rqw {z0.s}, p0/z, [sp, x0, lsl #2] __ dci(0x85c0c3e0); // ld1rsb {z0.h}, p0/z, [sp] __ dci(0x85c0a3e0); // ld1rsb {z0.s}, p0/z, [sp] __ dci(0x85c083e0); // ld1rsb {z0.d}, p0/z, [sp] __ dci(0x8540a3e0); // ld1rsh {z0.s}, p0/z, [sp] __ dci(0x854083e0); // ld1rsh {z0.d}, p0/z, [sp] __ dci(0x84c083e0); // ld1rsw {z0.d}, p0/z, [sp] __ dci(0x8540c3e0); // ld1rw {z0.s}, p0/z, [sp] __ dci(0x8540e3e0); // ld1rw {z0.d}, p0/z, [sp] __ dci(0xa5c0a3e0); // ld1sb {z0.h}, p0/z, [sp] __ dci(0xa5a0a3e0); // ld1sb {z0.s}, p0/z, [sp] __ dci(0xa580a3e0); // ld1sb {z0.d}, p0/z, [sp] __ dci(0xa5c043e0); // ld1sb {z0.h}, p0/z, [sp, x0] __ dci(0xa5a043e0); // ld1sb {z0.s}, p0/z, [sp, x0] __ dci(0xa58043e0); // ld1sb {z0.d}, p0/z, [sp, x0] __ dci(0xc44083e0); // ld1sb {z0.d}, p0/z, [sp, z0.d] __ 
dci(0xa520a3e0); // ld1sh {z0.s}, p0/z, [sp] __ dci(0xa500a3e0); // ld1sh {z0.d}, p0/z, [sp] __ dci(0xa52043e0); // ld1sh {z0.s}, p0/z, [sp, x0, lsl #1] __ dci(0xa50043e0); // ld1sh {z0.d}, p0/z, [sp, x0, lsl #1] __ dci(0xc4e083e0); // ld1sh {z0.d}, p0/z, [sp, z0.d, lsl #1] __ dci(0xc4c083e0); // ld1sh {z0.d}, p0/z, [sp, z0.d] __ dci(0xa480a3e0); // ld1sw {z0.d}, p0/z, [sp] __ dci(0xa48043e0); // ld1sw {z0.d}, p0/z, [sp, x0, lsl #2] __ dci(0xc56083e0); // ld1sw {z0.d}, p0/z, [sp, z0.d, lsl #2] __ dci(0xc54083e0); // ld1sw {z0.d}, p0/z, [sp, z0.d] __ dci(0xa540a3e0); // ld1w {z0.s}, p0/z, [sp] __ dci(0xa560a3e0); // ld1w {z0.d}, p0/z, [sp] __ dci(0xa54043e0); // ld1w {z0.s}, p0/z, [sp, x0, lsl #2] __ dci(0xa56043e0); // ld1w {z0.d}, p0/z, [sp, x0, lsl #2] __ dci(0xc560c3e0); // ld1w {z0.d}, p0/z, [sp, z0.d, lsl #2] __ dci(0xc540c3e0); // ld1w {z0.d}, p0/z, [sp, z0.d] __ dci(0xa420e3e0); // ld2b {z0.b, z1.b}, p0/z, [sp] __ dci(0xa420c3e0); // ld2b {z0.b, z1.b}, p0/z, [sp, x0] __ dci(0xa5a0e3e0); // ld2d {z0.d, z1.d}, p0/z, [sp] __ dci(0xa5a0c3e0); // ld2d {z0.d, z1.d}, p0/z, [sp, x0, lsl #3] __ dci(0xa4a0e3e0); // ld2h {z0.h, z1.h}, p0/z, [sp] __ dci(0xa4a0c3e0); // ld2h {z0.h, z1.h}, p0/z, [sp, x0, lsl #1] __ dci(0xa520e3e0); // ld2w {z0.s, z1.s}, p0/z, [sp] __ dci(0xa520c3e0); // ld2w {z0.s, z1.s}, p0/z, [sp, x0, lsl #2] __ dci(0xa440e3e0); // ld3b {z0.b, z1.b, z2.b}, p0/z, [sp] __ dci(0xa440c3e0); // ld3b {z0.b, z1.b, z2.b}, p0/z, [sp, x0] __ dci(0xa5c0e3e0); // ld3d {z0.d, z1.d, z2.d}, p0/z, [sp] __ dci(0xa5c0c3e0); // ld3d {z0.d, z1.d, z2.d}, p0/z, [sp, x0, lsl #3] __ dci(0xa4c0e3e0); // ld3h {z0.h, z1.h, z2.h}, p0/z, [sp] __ dci(0xa4c0c3e0); // ld3h {z0.h, z1.h, z2.h}, p0/z, [sp, x0, lsl #1] __ dci(0xa540e3e0); // ld3w {z0.s, z1.s, z2.s}, p0/z, [sp] __ dci(0xa540c3e0); // ld3w {z0.s, z1.s, z2.s}, p0/z, [sp, x0, lsl #2] __ dci(0xa460e3e0); // ld4b {z0.b, z1.b, z2.b, z3.b}, p0/z, [sp] __ dci(0xa460c3e0); // ld4b {z0.b, z1.b, z2.b, z3.b}, p0/z, [sp, x0] __ dci(0xa5e0e3e0); // ld4d {z0.d, z1.d, z2.d, z3.d}, p0/z, [sp] __ dci( 0xa5e0c3e0); // ld4d {z0.d, z1.d, z2.d, z3.d}, p0/z, [sp, x0, lsl #3] __ dci(0xa4e0e3e0); // ld4h {z0.h, z1.h, z2.h, z3.h}, p0/z, [sp] __ dci( 0xa4e0c3e0); // ld4h {z0.h, z1.h, z2.h, z3.h}, p0/z, [sp, x0, lsl #1] __ dci(0xa560e3e0); // ld4w {z0.s, z1.s, z2.s, z3.s}, p0/z, [sp] __ dci( 0xa560c3e0); // ld4w {z0.s, z1.s, z2.s, z3.s}, p0/z, [sp, x0, lsl #2] __ dci(0xa42063e0); // ldff1b {z0.h}, p0/z, [sp, x0] __ dci(0xa44063e0); // ldff1b {z0.s}, p0/z, [sp, x0] __ dci(0xa46063e0); // ldff1b {z0.d}, p0/z, [sp, x0] __ dci(0xa40063e0); // ldff1b {z0.b}, p0/z, [sp, x0] __ dci(0xc440e3e0); // ldff1b {z0.d}, p0/z, [sp, z0.d] __ dci(0xa5e063e0); // ldff1d {z0.d}, p0/z, [sp, x0, lsl #3] __ dci(0xc5e0e3e0); // ldff1d {z0.d}, p0/z, [sp, z0.d, lsl #3] __ dci(0xc5c0e3e0); // ldff1d {z0.d}, p0/z, [sp, z0.d] __ dci(0xa4a063e0); // ldff1h {z0.h}, p0/z, [sp, x0, lsl #1] __ dci(0xa4c063e0); // ldff1h {z0.s}, p0/z, [sp, x0, lsl #1] __ dci(0xa4e063e0); // ldff1h {z0.d}, p0/z, [sp, x0, lsl #1] __ dci(0xc4e0e3e0); // ldff1h {z0.d}, p0/z, [sp, z0.d, lsl #1] __ dci(0xc4c0e3e0); // ldff1h {z0.d}, p0/z, [sp, z0.d] __ dci(0xa5c063e0); // ldff1sb {z0.h}, p0/z, [sp, x0] __ dci(0xa5a063e0); // ldff1sb {z0.s}, p0/z, [sp, x0] __ dci(0xa58063e0); // ldff1sb {z0.d}, p0/z, [sp, x0] __ dci(0xc440a3e0); // ldff1sb {z0.d}, p0/z, [sp, z0.d] __ dci(0xa52063e0); // ldff1sh {z0.s}, p0/z, [sp, x0, lsl #1] __ dci(0xa50063e0); // ldff1sh {z0.d}, p0/z, [sp, x0, lsl #1] __ dci(0xc4e0a3e0); // ldff1sh {z0.d}, p0/z, 
[sp, z0.d, lsl #1] __ dci(0xc4c0a3e0); // ldff1sh {z0.d}, p0/z, [sp, z0.d] __ dci(0xa48063e0); // ldff1sw {z0.d}, p0/z, [sp, x0, lsl #2] __ dci(0xc560a3e0); // ldff1sw {z0.d}, p0/z, [sp, z0.d, lsl #2] __ dci(0xc540a3e0); // ldff1sw {z0.d}, p0/z, [sp, z0.d] __ dci(0xa54063e0); // ldff1w {z0.s}, p0/z, [sp, x0, lsl #2] __ dci(0xa56063e0); // ldff1w {z0.d}, p0/z, [sp, x0, lsl #2] __ dci(0xc560e3e0); // ldff1w {z0.d}, p0/z, [sp, z0.d, lsl #2] __ dci(0xc540e3e0); // ldff1w {z0.d}, p0/z, [sp, z0.d] __ dci(0xa430a3e0); // ldnf1b {z0.h}, p0/z, [sp] __ dci(0xa450a3e0); // ldnf1b {z0.s}, p0/z, [sp] __ dci(0xa470a3e0); // ldnf1b {z0.d}, p0/z, [sp] __ dci(0xa410a3e0); // ldnf1b {z0.b}, p0/z, [sp] __ dci(0xa5f0a3e0); // ldnf1d {z0.d}, p0/z, [sp] __ dci(0xa4b0a3e0); // ldnf1h {z0.h}, p0/z, [sp] __ dci(0xa4d0a3e0); // ldnf1h {z0.s}, p0/z, [sp] __ dci(0xa4f0a3e0); // ldnf1h {z0.d}, p0/z, [sp] __ dci(0xa5d0a3e0); // ldnf1sb {z0.h}, p0/z, [sp] __ dci(0xa5b0a3e0); // ldnf1sb {z0.s}, p0/z, [sp] __ dci(0xa590a3e0); // ldnf1sb {z0.d}, p0/z, [sp] __ dci(0xa530a3e0); // ldnf1sh {z0.s}, p0/z, [sp] __ dci(0xa510a3e0); // ldnf1sh {z0.d}, p0/z, [sp] __ dci(0xa490a3e0); // ldnf1sw {z0.d}, p0/z, [sp] __ dci(0xa550a3e0); // ldnf1w {z0.s}, p0/z, [sp] __ dci(0xa570a3e0); // ldnf1w {z0.d}, p0/z, [sp] __ dci(0xa400e3e0); // ldnt1b {z0.b}, p0/z, [sp] __ dci(0xa400c3e0); // ldnt1b {z0.b}, p0/z, [sp, x0] __ dci(0xa580e3e0); // ldnt1d {z0.d}, p0/z, [sp] __ dci(0xa580c3e0); // ldnt1d {z0.d}, p0/z, [sp, x0, lsl #3] __ dci(0xa480e3e0); // ldnt1h {z0.h}, p0/z, [sp] __ dci(0xa480c3e0); // ldnt1h {z0.h}, p0/z, [sp, x0, lsl #1] __ dci(0xa500e3e0); // ldnt1w {z0.s}, p0/z, [sp] __ dci(0xa500c3e0); // ldnt1w {z0.s}, p0/z, [sp, x0, lsl #2] __ dci(0x858043e0); // ldr z0, [sp] __ dci(0xe400e3e0); // st1b {z0.b}, p0, [sp] __ dci(0xe40043e0); // st1b {z0.b}, p0, [sp, x0] __ dci(0xe400a3e0); // st1b {z0.d}, p0, [sp, z0.d] __ dci(0xe5e0e3e0); // st1d {z0.d}, p0, [sp] __ dci(0xe5e043e0); // st1d {z0.d}, p0, [sp, x0, lsl #3] __ dci(0xe5a0a3e0); // st1d {z0.d}, p0, [sp, z0.d, lsl #3] __ dci(0xe580a3e0); // st1d {z0.d}, p0, [sp, z0.d] __ dci(0xe4e0e3e0); // st1h {z0.d}, p0, [sp] __ dci(0xe4e043e0); // st1h {z0.d}, p0, [sp, x0, lsl #1] __ dci(0xe4a0a3e0); // st1h {z0.d}, p0, [sp, z0.d, lsl #1] __ dci(0xe480a3e0); // st1h {z0.d}, p0, [sp, z0.d] __ dci(0xe560e3e0); // st1w {z0.d}, p0, [sp] __ dci(0xe56043e0); // st1w {z0.d}, p0, [sp, x0, lsl #2] __ dci(0xe430e3e0); // st2b {z0.b, z1.b}, p0, [sp] __ dci(0xe42063e0); // st2b {z0.b, z1.b}, p0, [sp, x0] __ dci(0xe5b0e3e0); // st2d {z0.d, z1.d}, p0, [sp] __ dci(0xe5a063e0); // st2d {z0.d, z1.d}, p0, [sp, x0, lsl #3] __ dci(0xe4b0e3e0); // st2h {z0.h, z1.h}, p0, [sp] __ dci(0xe4a063e0); // st2h {z0.h, z1.h}, p0, [sp, x0, lsl #1] __ dci(0xe530e3e0); // st2w {z0.s, z1.s}, p0, [sp] __ dci(0xe52063e0); // st2w {z0.s, z1.s}, p0, [sp, x0, lsl #2] __ dci(0xe450e3e0); // st3b {z0.b, z1.b, z2.b}, p0, [sp] __ dci(0xe44063e0); // st3b {z0.b, z1.b, z2.b}, p0, [sp, x0] __ dci(0xe5d0e3e0); // st3d {z0.d, z1.d, z2.d}, p0, [sp] __ dci(0xe5c063e0); // st3d {z0.d, z1.d, z2.d}, p0, [sp, x0, lsl #3] __ dci(0xe4d0e3e0); // st3h {z0.h, z1.h, z2.h}, p0, [sp] __ dci(0xe4c063e0); // st3h {z0.h, z1.h, z2.h}, p0, [sp, x0, lsl #1] __ dci(0xe550e3e0); // st3w {z0.s, z1.s, z2.s}, p0, [sp] __ dci(0xe54063e0); // st3w {z0.s, z1.s, z2.s}, p0, [sp, x0, lsl #2] __ dci(0xe470e3e0); // st4b {z0.b, z1.b, z2.b, z3.b}, p0, [sp] __ dci(0xe46063e0); // st4b {z0.b, z1.b, z2.b, z3.b}, p0, [sp, x0] __ dci(0xe5f0e3e0); // st4d {z0.d, z1.d, z2.d, z3.d}, 
p0, [sp] __ dci(0xe5e063e0); // st4d {z0.d, z1.d, z2.d, z3.d}, p0, [sp, x0, lsl #3] __ dci(0xe4f0e3e0); // st4h {z0.h, z1.h, z2.h, z3.h}, p0, [sp] __ dci(0xe4e063e0); // st4h {z0.h, z1.h, z2.h, z3.h}, p0, [sp, x0, lsl #1] __ dci(0xe570e3e0); // st4w {z0.s, z1.s, z2.s, z3.s}, p0, [sp] __ dci(0xe56063e0); // st4w {z0.s, z1.s, z2.s, z3.s}, p0, [sp, x0, lsl #2] __ dci(0xe410e3e0); // stnt1b {z0.b}, p0, [sp] __ dci(0xe40063e0); // stnt1b {z0.b}, p0, [sp, x0] __ dci(0xe590e3e0); // stnt1d {z0.d}, p0, [sp] __ dci(0xe58063e0); // stnt1d {z0.d}, p0, [sp, x0, lsl #3] __ dci(0xe490e3e0); // stnt1h {z0.h}, p0, [sp] __ dci(0xe48063e0); // stnt1h {z0.h}, p0, [sp, x0, lsl #1] __ dci(0xe510e3e0); // stnt1w {z0.s}, p0, [sp] __ dci(0xe50063e0); // stnt1w {z0.s}, p0, [sp, x0, lsl #2] __ dci(0x858003e0); // ldr p0, [sp] __ dci(0xe58003e0); // str p0, [sp] __ dci(0xe58043e0); // str z0, [sp] } END(); if (CAN_RUN()) { RUN(); // No checks are made here. The test is designed to ensure that the base // register is interpreted correctly as sp, not xzr. If it is interpreted // as xzr, the memory access to addresses near zero will fault, and the // test will fail. } } // Manually constructed simulator test to avoid creating a VL128 variant. #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 void Test_sve_fmatmul(Test* config) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM); // Only double-precision matrix multiply is tested here. Single-precision is // tested in the simulator tests using a generated sequence. The (templated) // code used in the simulator for both cases is the same, which is why the // tests here don't need to be comprehensive. START(); Label vl_too_short; __ Rdvl(x0, 1); __ Cmp(x0, 32); __ B(lt, &vl_too_short); // Skip testing VL128. __ Fdup(z0.VnD(), 1.0); __ Fdup(z1.VnD(), 2.0); __ Mov(z2.VnD(), 0); // Build 2x2 identity matrix in z3. Label iden_loop; __ Lsr(x0, x0, 5); __ Bind(&iden_loop); __ Insr(z3.VnD(), d0); __ Insr(z3.VnD(), d2); __ Insr(z3.VnD(), d2); __ Insr(z3.VnD(), d0); __ Sub(x0, x0, 1); __ Cbnz(x0, &iden_loop); __ Fmmla(z1.VnD(), z1.VnD(), z0.VnD(), z0.VnD()); __ Fmmla(z2.VnD(), z2.VnD(), z1.VnD(), z3.VnD()); __ Ptrue(p0.VnB()); __ Index(z4.VnD(), -8, 3); __ Scvtf(z4.VnD(), p0.Merging(), z4.VnD()); __ Mov(z5.VnD(), 0); __ Fmmla(z4.VnD(), z4.VnD(), z4.VnD(), z4.VnD()); __ Fmmla(z5.VnD(), z5.VnD(), z4.VnD(), z3.VnD()); __ Bind(&vl_too_short); END(); if (CAN_RUN()) { RUN(); int vl = core.GetSVELaneCount(kBRegSize) * 8; if (vl >= 256) { ASSERT_EQUAL_SVE(z1, z2); ASSERT_EQUAL_SVE(z4, z5); switch (vl) { case 256: case 384: { // All results are 4.0 (1 * 1 + 2). Results for elements beyond a VL // that's a multiple of 256 bits should be zero. 
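          // Spelling out the arithmetic: double-precision Fmmla treats each
          // 256-bit segment as a 2x2 matrix, so each active element is the
          // accumulator plus two products: 2.0 + (1.0 * 1.0) + (1.0 * 1.0).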
uint64_t z1_expected[] = {0x0000000000000000, 0x0000000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); uint64_t z4_expected[] = {0x0000000000000000, 0x0000000000000000, 0x4018000000000000, // 6.0 0x4022000000000000, // 9.0 0x4018000000000000, // 6.0 0x4054400000000000}; // 81.0 ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); break; } case 2048: { uint64_t z1_expected[] = {0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000}; ASSERT_EQUAL_SVE(z1_expected, z1.VnD()); uint64_t z4_expected[] = { 0x40cb690000000000, 0x40c9728000000000, 0x40c9710000000000, 0x40c79e8000000000, 0x40c41f0000000000, 0x40c2708000000000, 0x40c26f0000000000, 0x40c0e48000000000, 0x40bbea0000000000, 0x40b91d0000000000, 0x40b91a0000000000, 0x40b6950000000000, 0x40b1d60000000000, 0x40af320000000000, 0x40af2c0000000000, 0x40ab420000000000, 0x40a4040000000000, 0x40a0aa0000000000, 0x40a0a40000000000, 0x409bb40000000000, 0x4091b80000000000, 0x408a880000000000, 0x408a700000000000, 0x4083c80000000000, 0x4071a00000000000, 0x4061a00000000000, 0x4061400000000000, 0x4051400000000000, 0x4018000000000000, 0x4022000000000000, 0x4018000000000000, 0x4054400000000000, }; ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); break; } default: printf("WARNING: Some tests skipped due to unexpected VL.\n"); break; } } } } Test* test_sve_fmatmul_list[] = {Test::MakeSVETest(256, "AARCH64_ASM_sve_fmatmul_vl256", &Test_sve_fmatmul), Test::MakeSVETest(384, "AARCH64_ASM_sve_fmatmul_vl384", &Test_sve_fmatmul), Test::MakeSVETest(2048, "AARCH64_ASM_sve_fmatmul_vl2048", &Test_sve_fmatmul)}; void Test_sve_ld1ro(Test* config) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM); START(); int data_size = (kQRegSizeInBytes + 128) * 4; uint8_t* data = new uint8_t[data_size]; for (int i = 0; i < data_size; i++) { data[i] = i & 0xff; } // Set the base to just past half-way through the buffer so we can use // negative indices. __ Mov(x0, reinterpret_cast(&data[7 + data_size / 2])); __ Index(z0.VnB(), 0, 1); __ Ptrue(p0.VnB()); __ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4); __ Pfalse(p1.VnB()); __ Zip1(p1.VnB(), p0.VnB(), p1.VnB()); __ Ptrue(p2.VnB()); __ Mov(x1, -32); __ Ld1rob(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, -32)); __ Ld1rob(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1)); __ Mov(x1, 64 / 2); __ Ld1roh(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, 64)); __ Ld1roh(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1)); __ Mov(x1, -96 / 4); __ Ld1row(z4.VnS(), p2.Zeroing(), SVEMemOperand(x0, -96)); __ Ld1row(z5.VnS(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 2)); __ Mov(x1, 128 / 8); __ Ld1rod(z6.VnD(), p2.Zeroing(), SVEMemOperand(x0, 128)); __ Ld1rod(z7.VnD(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 3)); // Check that all 256-bit segments match by rotating the vector by one // segment, eoring, and orring across the vector. 
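  // In the sequence below, Ext with the same register as both sources and a
  // 32-byte offset rotates a vector by one 256-bit segment; if every segment
  // holds the same data, the rotated copy XORs with the original to zero.
  // Orv reduces each check to a single byte, and the bytes are ORed together
  // into z9, which is expected to be zero.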
__ Dup(z11.VnQ(), z0.VnQ(), 2); __ Mov(z8, z0); __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32); __ Eor(z8.VnB(), z8.VnB(), z0.VnB()); __ Orv(b9, p2, z8.VnB()); __ Mov(z8, z2); __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32); __ Eor(z8.VnB(), z8.VnB(), z2.VnB()); __ Orv(b8, p2, z8.VnB()); __ Orr(z9, z9, z8); __ Mov(z8, z4); __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32); __ Eor(z8.VnB(), z8.VnB(), z4.VnB()); __ Orv(b8, p2, z8.VnB()); __ Orr(z9, z9, z8); __ Mov(z8, z6); __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32); __ Eor(z8.VnB(), z8.VnB(), z6.VnB()); __ Orv(b8, p2, z8.VnB()); __ Orr(z9, z9, z8); END(); if (CAN_RUN()) { RUN(); int vl = core.GetSVELaneCount(kBRegSize) * 8; if (vl >= 256) { ASSERT_EQUAL_SVE(z0, z1); ASSERT_EQUAL_SVE(z2, z3); ASSERT_EQUAL_SVE(z4, z5); ASSERT_EQUAL_SVE(z6, z7); switch (vl) { case 256: case 2048: { // Check the result of the rotate/eor sequence. uint64_t expected_z9[] = {0, 0}; ASSERT_EQUAL_SVE(expected_z9, z9.VnD()); break; } case 384: { // For non-multiple-of-256 VL, the top 128-bits must be zero, which // breaks the rotate/eor sequence. Check the results explicitly. uint64_t z0_expected[] = {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x000d000b00090007}; uint64_t z2_expected[] = {0x0000000000000000, 0x0000000000000000, 0x868584838281807f, 0x7e7d7c7b7a797877, 0x767574737271706f, 0x6e6d6c6b6a696867}; uint64_t z4_expected[] = {0x0000000000000000, 0x0000000000000000, 0xe6e5e4e3e2e1e0df, 0xdedddcdbdad9d8d7, 0xd6d5d4d3d2d1d0cf, 0xcecdcccbcac9c8c7}; uint64_t z6_expected[] = {0x0000000000000000, 0x0000000000000000, 0xc6c5c4c3c2c1c0bf, 0xbebdbcbbbab9b8b7, 0xb6b5b4b3b2b1b0af, 0xaeadacabaaa9a8a7}; ASSERT_EQUAL_SVE(z0_expected, z0.VnD()); ASSERT_EQUAL_SVE(z2_expected, z2.VnD()); ASSERT_EQUAL_SVE(z4_expected, z4.VnD()); ASSERT_EQUAL_SVE(z6_expected, z6.VnD()); break; } default: printf("WARNING: Some tests skipped due to unexpected VL.\n"); break; } } } } Test* test_sve_ld1ro_list[] = {Test::MakeSVETest(256, "AARCH64_ASM_sve_ld1ro_vl256", &Test_sve_ld1ro), Test::MakeSVETest(384, "AARCH64_ASM_sve_ld1ro_vl384", &Test_sve_ld1ro), Test::MakeSVETest(2048, "AARCH64_ASM_sve_ld1ro_vl2048", &Test_sve_ld1ro)}; #endif } // namespace aarch64 } // namespace vixl