// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may be
//     used to endorse or promote products derived from this software without
//     specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

#include "macro-assembler-aarch64.h"

namespace vixl {
namespace aarch64 {

void MacroAssembler::AddSubHelper(AddSubHelperOption option,
                                  const ZRegister& zd,
                                  const ZRegister& zn,
                                  IntegerOperand imm) {
  VIXL_ASSERT(imm.FitsInLane(zd));

  // Simple, encodable cases.
  if (TrySingleAddSub(option, zd, zn, imm)) return;

  VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate));
  bool add_imm = (option == kAddImmediate);

  // Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one
  // instruction. Also interpret the immediate as signed, so we can convert
  // Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc.
  IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits()));
  if (signed_imm.IsNegative()) {
    AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate;
    IntegerOperand n_imm(signed_imm.GetMagnitude());
    // IntegerOperand can represent -INT_MIN, so this is always safe.
    VIXL_ASSERT(n_imm.IsPositiveOrZero());
    if (TrySingleAddSub(n_option, zd, zn, n_imm)) return;
  }

  // Otherwise, fall back to dup + ADD_z_z/SUB_z_z.
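  // This fallback needs a scratch Z register and typically costs at least two
  // instructions: the `Dup` (which may itself need a scratch core register)
  // plus the unpredicated `add` or `sub`.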
  UseScratchRegisterScope temps(this);
  ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
  Dup(scratch, imm);

  SingleEmissionCheckScope guard(this);
  if (add_imm) {
    add(zd, zn, scratch);
  } else {
    sub(zd, zn, scratch);
  }
}

bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option,
                                     const ZRegister& zd,
                                     const ZRegister& zn,
                                     IntegerOperand imm) {
  VIXL_ASSERT(imm.FitsInLane(zd));

  int imm8;
  int shift = -1;
  if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
    MovprfxHelperScope guard(this, zd, zn);
    switch (option) {
      case kAddImmediate:
        add(zd, zd, imm8, shift);
        return true;
      case kSubImmediate:
        sub(zd, zd, imm8, shift);
        return true;
    }
  }
  return false;
}

void MacroAssembler::IntWideImmHelper(IntArithImmFn imm_fn,
                                      SVEArithPredicatedFn reg_macro,
                                      const ZRegister& zd,
                                      const ZRegister& zn,
                                      IntegerOperand imm,
                                      bool is_signed) {
  if (is_signed) {
    // E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi
    if (imm.IsInt8()) {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*imm_fn)(zd, zd, imm.AsInt8());
      return;
    }
  } else {
    // E.g. UMIN_z_zi, UMAX_z_zi
    if (imm.IsUint8()) {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*imm_fn)(zd, zd, imm.AsUint8());
      return;
    }
  }

  UseScratchRegisterScope temps(this);
  PRegister pg = temps.AcquireGoverningP();
  Ptrue(pg.WithSameLaneSizeAs(zd));

  // Try to re-use zd if we can, so we can avoid a movprfx.
  ZRegister scratch =
      zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits())
                     : zd;
  Dup(scratch, imm);

  // The vector-form macro for commutative operations will swap the arguments to
  // avoid movprfx, if necessary.
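  // For example (illustrative), `Umin(z0.VnH(), z0.VnH(), 1000)` reaches this
  // point and expands to roughly:
  //   ptrue p<t>.h
  //   mov w<t>, #1000
  //   dup z<t>.h, w<t>
  //   umin z0.h, p<t>/m, z0.h, z<t>.h
  // where <t> denotes scratch registers chosen by UseScratchRegisterScope.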
  (this->*reg_macro)(zd, pg.Merging(), zn, scratch);
}

void MacroAssembler::Mul(const ZRegister& zd,
                         const ZRegister& zn,
                         IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  IntArithImmFn imm_fn = &Assembler::mul;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Smin(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInSignedLane(zd));
  IntArithImmFn imm_fn = &Assembler::smin;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Smax(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInSignedLane(zd));
  IntArithImmFn imm_fn = &Assembler::smax;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Umax(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
  IntArithImmFn imm_fn = &Assembler::umax;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}

void MacroAssembler::Umin(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
  IntArithImmFn imm_fn = &Assembler::umin;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}

void MacroAssembler::Addpl(const Register& xd,
                           const Register& xn,
                           int64_t multiplier) {
  VIXL_ASSERT(allow_macro_instructions_);

  // This macro relies on `Rdvl` to handle some out-of-range cases. Check that
  // `VL * multiplier` cannot overflow, for any possible value of VL.
  VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
  VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));

  if (xd.IsZero()) return;
  if (xn.IsZero() && xd.IsSP()) {
    // TODO: This operation doesn't make much sense, but we could support it
    // with a scratch register if necessary.
    VIXL_UNIMPLEMENTED();
  }

  // Handling xzr requires an extra move, so defer it until later, so that we
  // can first try to use `rdvl` instead (via `Addvl`).
  if (IsInt6(multiplier) && !xn.IsZero()) {
    SingleEmissionCheckScope guard(this);
    addpl(xd, xn, static_cast<int>(multiplier));
    return;
  }

  // If `multiplier` is a multiple of 8, we can use `Addvl` instead.
  if ((multiplier % kZRegBitsPerPRegBit) == 0) {
    Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit);
    return;
  }

  if (IsInt6(multiplier)) {
    VIXL_ASSERT(xn.IsZero());  // Other cases were handled with `addpl`.
    // There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so
    // materialise a zero.
    MacroEmissionCheckScope guard(this);
    movz(xd, 0);
    addpl(xd, xd, static_cast<int>(multiplier));
    return;
  }

  // TODO: Some plausible cases result in rather long sequences. For example,
  // `Addpl(sp, sp, 33)` requires five instructions, even though it's only just
  // outside the encodable range. We should look for ways to cover such cases
  // without drastically increasing the complexity of this logic.

  // For other cases, calculate xn + (PL * multiplier) using discrete
  // instructions. This requires two scratch registers in the general case, so
  // try to re-use the destination as a scratch register.
  UseScratchRegisterScope temps(this);
  temps.Include(xd);
  temps.Exclude(xn);

  Register scratch = temps.AcquireX();
  // There is no `rdpl`, so we have to calculate PL from VL. We can't scale the
  // multiplier because (we already know) it isn't a multiple of 8.
  Rdvl(scratch, multiplier);

  MacroEmissionCheckScope guard(this);
  if (xn.IsZero()) {
    asr(xd, scratch, kZRegBitsPerPRegBitLog2);
  } else if (xd.IsSP() || xn.IsSP()) {
    // TODO: MacroAssembler::Add should be able to handle this.
    asr(scratch, scratch, kZRegBitsPerPRegBitLog2);
    add(xd, xn, scratch);
  } else {
    add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2));
  }
}

void MacroAssembler::Addvl(const Register& xd,
                           const Register& xn,
                           int64_t multiplier) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(xd.IsX());
  VIXL_ASSERT(xn.IsX());

  // Check that `VL * multiplier` cannot overflow, for any possible value of VL.
  VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
  VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));

  if (xd.IsZero()) return;
  if (xn.IsZero() && xd.IsSP()) {
    // TODO: This operation doesn't make much sense, but we could support it
    // with a scratch register if necessary. `rdvl` cannot write into `sp`.
    VIXL_UNIMPLEMENTED();
  }

  if (IsInt6(multiplier)) {
    SingleEmissionCheckScope guard(this);
    if (xn.IsZero()) {
      rdvl(xd, static_cast<int>(multiplier));
    } else {
      addvl(xd, xn, static_cast<int>(multiplier));
    }
    return;
  }

  // TODO: Some plausible cases result in rather long sequences. For example,
  // `Addvl(sp, sp, 42)` requires four instructions, even though it's only just
  // outside the encodable range. We should look for ways to cover such cases
  // without drastically increasing the complexity of this logic.

  // For other cases, calculate xn + (VL * multiplier) using discrete
  // instructions. This requires two scratch registers in the general case, so
  // we try to re-use the destination as a scratch register.
  UseScratchRegisterScope temps(this);
  temps.Include(xd);
  temps.Exclude(xn);

  Register a = temps.AcquireX();
  Mov(a, multiplier);

  MacroEmissionCheckScope guard(this);
  Register b = temps.AcquireX();
  rdvl(b, 1);
  if (xn.IsZero()) {
    mul(xd, a, b);
  } else if (xd.IsSP() || xn.IsSP()) {
    mul(a, a, b);
    add(xd, xn, a);
  } else {
    madd(xd, a, b, xn);
  }
}

void MacroAssembler::CalculateSVEAddress(const Register& xd,
                                         const SVEMemOperand& addr,
                                         int vl_divisor_log2) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(!addr.IsScatterGather());
  VIXL_ASSERT(xd.IsX());

  // The lower bound is where a whole Z register is accessed.
  VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0));
  // The upper bound is for P register accesses, and for instructions like
  // "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane.
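  // (kZRegBitsPerPRegBitLog2 is 3: each predicate bit governs one byte, i.e.
  // eight bits, of Z register data.)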
  VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2));

  SVEOffsetModifier mod = addr.GetOffsetModifier();
  Register base = addr.GetScalarBase();

  if (addr.IsEquivalentToScalar()) {
    // For example:
    //   [x0]
    //   [x0, #0]
    //   [x0, xzr, LSL 2]
    Mov(xd, base);
  } else if (addr.IsScalarPlusImmediate()) {
    // For example:
    //   [x0, #42]
    //   [x0, #42, MUL VL]
    int64_t offset = addr.GetImmediateOffset();
    VIXL_ASSERT(offset != 0);  // Handled by IsEquivalentToScalar.
    if (addr.IsMulVl()) {
      int vl_divisor = 1 << vl_divisor_log2;
      // For all possible values of vl_divisor, we can simply use `Addpl`. This
      // will select `addvl` if necessary.
      VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0);
      Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor));
    } else {
      // IsScalarPlusImmediate() ensures that no other modifiers can occur.
      VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
      Add(xd, base, offset);
    }
  } else if (addr.IsScalarPlusScalar()) {
    // For example:
    //   [x0, x1]
    //   [x0, x1, LSL #4]
    Register offset = addr.GetScalarOffset();
    VIXL_ASSERT(!offset.IsZero());  // Handled by IsEquivalentToScalar.
    if (mod == SVE_LSL) {
      Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount()));
    } else {
      // IsScalarPlusScalar() ensures that no other modifiers can occur.
      VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
      Add(xd, base, offset);
    }
  } else {
    // All other forms are scatter-gather addresses, which cannot be evaluated
    // into an X register.
    VIXL_UNREACHABLE();
  }
}

void MacroAssembler::Cpy(const ZRegister& zd,
                         const PRegister& pg,
                         IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInLane(zd));
  int imm8;
  int shift;
  if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
    SingleEmissionCheckScope guard(this);
    cpy(zd, pg, imm8, shift);
    return;
  }

  // The fallbacks rely on `cpy` variants that only support merging predication.
  // If zeroing predication was requested, zero the destination first.
  if (pg.IsZeroing()) {
    SingleEmissionCheckScope guard(this);
    dup(zd, 0);
  }
  PRegisterM pg_m = pg.Merging();

  // Try to encode the immediate using fcpy.
  VIXL_ASSERT(imm.FitsInLane(zd));
  if (zd.GetLaneSizeInBits() >= kHRegSize) {
    double fp_imm = 0.0;
    switch (zd.GetLaneSizeInBits()) {
      case kHRegSize:
        fp_imm =
            FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN);
        break;
      case kSRegSize:
        fp_imm = RawbitsToFloat(imm.AsUint32());
        break;
      case kDRegSize:
        fp_imm = RawbitsToDouble(imm.AsUint64());
        break;
      default:
        VIXL_UNREACHABLE();
        break;
    }
    // IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so
    // we can use IsImmFP64 for all lane sizes.
    if (IsImmFP64(fp_imm)) {
      SingleEmissionCheckScope guard(this);
      fcpy(zd, pg_m, fp_imm);
      return;
    }
  }

  // Fall back to using a scratch register.
  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireRegisterToHoldLane(zd);
  Mov(scratch, imm);

  SingleEmissionCheckScope guard(this);
  cpy(zd, pg_m, scratch);
}

// TODO: We implement Fcpy (amongst other things) for all FP types because it
// allows us to preserve user-specified NaNs. We should come up with some
// FPImmediate type to abstract this, and avoid all the duplication below (and
// elsewhere).

void MacroAssembler::Fcpy(const ZRegister& zd,
                          const PRegisterM& pg,
                          double imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pg.IsMerging());

  if (IsImmFP64(imm)) {
    SingleEmissionCheckScope guard(this);
    fcpy(zd, pg, imm);
    return;
  }

  // As a fall-back, cast the immediate to the required lane size, and try to
  // encode the bit pattern using `Cpy`.
  Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}

void MacroAssembler::Fcpy(const ZRegister& zd,
                          const PRegisterM& pg,
                          float imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pg.IsMerging());

  if (IsImmFP32(imm)) {
    SingleEmissionCheckScope guard(this);
    fcpy(zd, pg, imm);
    return;
  }

  // As a fall-back, cast the immediate to the required lane size, and try to
  // encode the bit pattern using `Cpy`.
  Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}

void MacroAssembler::Fcpy(const ZRegister& zd,
                          const PRegisterM& pg,
                          Float16 imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pg.IsMerging());

  if (IsImmFP16(imm)) {
    SingleEmissionCheckScope guard(this);
    fcpy(zd, pg, imm);
    return;
  }

  // As a fall-back, cast the immediate to the required lane size, and try to
  // encode the bit pattern using `Cpy`.
  Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}

void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInLane(zd));
  unsigned lane_size = zd.GetLaneSizeInBits();
  int imm8;
  int shift;
  if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
    SingleEmissionCheckScope guard(this);
    dup(zd, imm8, shift);
  } else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) {
    SingleEmissionCheckScope guard(this);
    dupm(zd, imm.AsUintN(lane_size));
  } else {
    UseScratchRegisterScope temps(this);
    Register scratch = temps.AcquireRegisterToHoldLane(zd);
    Mov(scratch, imm);

    SingleEmissionCheckScope guard(this);
    dup(zd, scratch);
  }
}

void MacroAssembler::NoncommutativeArithmeticHelper(
    const ZRegister& zd,
    const PRegisterM& pg,
    const ZRegister& zn,
    const ZRegister& zm,
    SVEArithPredicatedFn fn,
    SVEArithPredicatedFn rev_fn) {
  if (zd.Aliases(zn)) {
    // E.g. zd = zd / zm
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, pg, zn, zm);
  } else if (zd.Aliases(zm)) {
    // E.g. zd = zn / zd
    SingleEmissionCheckScope guard(this);
    (this->*rev_fn)(zd, pg, zm, zn);
  } else {
    // E.g. zd = zn / zm
    MovprfxHelperScope guard(this, zd, pg, zn);
    (this->*fn)(zd, pg, zd, zm);
  }
}

void MacroAssembler::FPCommutativeArithmeticHelper(
    const ZRegister& zd,
    const PRegisterM& pg,
    const ZRegister& zn,
    const ZRegister& zm,
    SVEArithPredicatedFn fn,
    FPMacroNaNPropagationOption nan_option) {
  ResolveFPNaNPropagationOption(&nan_option);

  if (zd.Aliases(zn)) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, pg, zd, zm);
  } else if (zd.Aliases(zm)) {
    switch (nan_option) {
      case FastNaNPropagation: {
        // Swap the arguments.
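        // FastNaNPropagation lets us treat the operation as commutative here;
        // only the propagated NaN payload can differ from the strictly-ordered
        // form.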
        SingleEmissionCheckScope guard(this);
        (this->*fn)(zd, pg, zd, zn);
        return;
      }
      case StrictNaNPropagation: {
        UseScratchRegisterScope temps(this);
        // Use a scratch register to keep the argument order exactly as
        // specified.
        ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
        {
          MovprfxHelperScope guard(this, scratch, pg, zn);
          (this->*fn)(scratch, pg, scratch, zm);
        }
        Mov(zd, scratch);
        return;
      }
      case NoFPMacroNaNPropagationSelected:
        VIXL_UNREACHABLE();
        return;
    }
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    (this->*fn)(zd, pg, zd, zm);
  }
}

// Instructions of the form "inst zda, zn, zm, #num", which are non-commutative
// and for which no reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(V) \
  V(Cmla, cmla)                              \
  V(Sqrdcmlah, sqrdcmlah)

#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN)                     \
  void MacroAssembler::MASMFN(const ZRegister& zd,               \
                              const ZRegister& za,               \
                              const ZRegister& zn,               \
                              const ZRegister& zm,               \
                              int imm) {                         \
    if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
      UseScratchRegisterScope temps(this);                       \
      VIXL_ASSERT(AreSameLaneSize(zn, zm));                      \
      ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);  \
      Mov(ztmp, zd.Aliases(zn) ? zn : zm);                       \
      MovprfxHelperScope guard(this, zd, za);                    \
      ASMFN(zd,                                                  \
            (zd.Aliases(zn) ? ztmp : zn),                        \
            (zd.Aliases(zm) ? ztmp : zm),                        \
            imm);                                                \
    } else {                                                     \
      MovprfxHelperScope guard(this, zd, za);                    \
      ASMFN(zd, zn, zm, imm);                                    \
    }                                                            \
  }
VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

// Instructions of the form "inst zda, zn, zm, #num, #num", which are
// non-commutative and for which no reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(V) \
  V(Cmla, cmla)                               \
  V(Sqrdcmlah, sqrdcmlah)

// This doesn't handle zm when it's out of the range that can be encoded in the
// instruction. The range depends on the element size: z0-z7 for H, z0-z15
// for S.
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN)                     \
  void MacroAssembler::MASMFN(const ZRegister& zd,               \
                              const ZRegister& za,               \
                              const ZRegister& zn,               \
                              const ZRegister& zm,               \
                              int index,                         \
                              int rot) {                         \
    if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
      UseScratchRegisterScope temps(this);                       \
      ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);  \
      {                                                          \
        MovprfxHelperScope guard(this, ztmp, za);                \
        ASMFN(ztmp, zn, zm, index, rot);                         \
      }                                                          \
      Mov(zd, ztmp);                                             \
    } else {                                                     \
      MovprfxHelperScope guard(this, zd, za);                    \
      ASMFN(zd, zn, zm, index, rot);                             \
    }                                                            \
  }
VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

// Instructions of the form "inst zda, pg, zda, zn", which are non-commutative
// and for which no reversed form is provided.
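// For example (illustrative), `Faddp(z0.VnS(), p0.Merging(), z1.VnS(),
// z2.VnS())` becomes a movprfx from z1 into z0 followed by
// `faddp z0.s, p0/m, z0.s, z2.s`; if z0 aliased z2 (but not z1), z2 would
// first be copied to a scratch register.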
#define VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(V) \
  V(Addp, addp)                             \
  V(Bic, bic)                               \
  V(Faddp, faddp)                           \
  V(Fmaxnmp, fmaxnmp)                       \
  V(Fminnmp, fminnmp)                       \
  V(Fmaxp, fmaxp)                           \
  V(Fminp, fminp)                           \
  V(Fscale, fscale)                         \
  V(Smaxp, smaxp)                           \
  V(Sminp, sminp)                           \
  V(Suqadd, suqadd)                         \
  V(Umaxp, umaxp)                           \
  V(Uminp, uminp)                           \
  V(Usqadd, usqadd)

#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN)                       \
  void MacroAssembler::MASMFN(const ZRegister& zd,                 \
                              const PRegisterM& pg,                \
                              const ZRegister& zn,                 \
                              const ZRegister& zm) {               \
    VIXL_ASSERT(allow_macro_instructions_);                        \
    if (zd.Aliases(zm) && !zd.Aliases(zn)) {                       \
      UseScratchRegisterScope temps(this);                         \
      ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm); \
      Mov(scratch, zm);                                            \
      MovprfxHelperScope guard(this, zd, pg, zn);                  \
      ASMFN(zd, pg, zd, scratch);                                  \
    } else {                                                       \
      MovprfxHelperScope guard(this, zd, pg, zn);                  \
      ASMFN(zd, pg, zd, zm);                                       \
    }                                                              \
  }
VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

// Instructions of the form "inst zda, pg, zda, zn", which are non-commutative
// and for which a reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(V) \
  V(Asr, asr)                                       \
  V(Fdiv, fdiv)                                     \
  V(Fsub, fsub)                                     \
  V(Lsl, lsl)                                       \
  V(Lsr, lsr)                                       \
  V(Sdiv, sdiv)                                     \
  V(Shsub, shsub)                                   \
  V(Sqrshl, sqrshl)                                 \
  V(Sqshl, sqshl)                                   \
  V(Sqsub, sqsub)                                   \
  V(Srshl, srshl)                                   \
  V(Sub, sub)                                       \
  V(Udiv, udiv)                                     \
  V(Uhsub, uhsub)                                   \
  V(Uqrshl, uqrshl)                                 \
  V(Uqshl, uqshl)                                   \
  V(Uqsub, uqsub)                                   \
  V(Urshl, urshl)

#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN)                          \
  void MacroAssembler::MASMFN(const ZRegister& zd,                    \
                              const PRegisterM& pg,                   \
                              const ZRegister& zn,                    \
                              const ZRegister& zm) {                  \
    VIXL_ASSERT(allow_macro_instructions_);                           \
    NoncommutativeArithmeticHelper(zd,                                \
                                   pg,                                \
                                   zn,                                \
                                   zm,                                \
                                   static_cast<SVEArithPredicatedFn>( \
                                       &Assembler::ASMFN),            \
                                   static_cast<SVEArithPredicatedFn>( \
                                       &Assembler::ASMFN##r));        \
  }
VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

void MacroAssembler::Fadd(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fadd),
                                nan_option);
}

void MacroAssembler::Fabd(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fabd),
                                nan_option);
}

void MacroAssembler::Fmul(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmul),
                                nan_option);
}

void MacroAssembler::Fmulx(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmulx),
                                nan_option);
}

void MacroAssembler::Fmax(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmax),
                                nan_option);
}

void MacroAssembler::Fmin(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmin),
                                nan_option);
}

void MacroAssembler::Fmaxnm(const ZRegister& zd,
                            const PRegisterM& pg,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmaxnm),
                                nan_option);
}

void MacroAssembler::Fminnm(const ZRegister& zd,
                            const PRegisterM& pg,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fminnm),
                                nan_option);
}

void MacroAssembler::Fdup(const ZRegister& zd, double imm) {
  VIXL_ASSERT(allow_macro_instructions_);

  switch (zd.GetLaneSizeInBits()) {
    case kHRegSize:
      Fdup(zd, Float16(imm));
      break;
    case kSRegSize:
      Fdup(zd, static_cast<float>(imm));
      break;
    case kDRegSize:
      uint64_t bits = DoubleToRawbits(imm);
      if (IsImmFP64(bits)) {
        SingleEmissionCheckScope guard(this);
        fdup(zd, imm);
      } else {
        Dup(zd, bits);
      }
      break;
  }
}

void MacroAssembler::Fdup(const ZRegister& zd, float imm) {
  VIXL_ASSERT(allow_macro_instructions_);

  switch (zd.GetLaneSizeInBits()) {
    case kHRegSize:
      Fdup(zd, Float16(imm));
      break;
    case kSRegSize:
      if (IsImmFP32(imm)) {
        SingleEmissionCheckScope guard(this);
        fdup(zd, imm);
      } else {
        Dup(zd, FloatToRawbits(imm));
      }
      break;
    case kDRegSize:
      Fdup(zd, static_cast<double>(imm));
      break;
  }
}

void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) {
  VIXL_ASSERT(allow_macro_instructions_);

  switch (zd.GetLaneSizeInBits()) {
    case kHRegSize:
      if (IsImmFP16(imm)) {
        SingleEmissionCheckScope guard(this);
        fdup(zd, imm);
      } else {
        Dup(zd, Float16ToRawbits(imm));
      }
      break;
    case kSRegSize:
      Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN));
      break;
    case kDRegSize:
      Fdup(zd, FPToDouble(imm, kIgnoreDefaultNaN));
      break;
  }
}

void MacroAssembler::Index(const ZRegister& zd,
                           const Operand& start,
                           const Operand& step) {
  class IndexOperand : public Operand {
   public:
    static IndexOperand Prepare(MacroAssembler* masm,
                                UseScratchRegisterScope* temps,
                                const Operand& op,
                                const ZRegister& zd_inner) {
      // Look for encodable immediates.
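      // `index` takes 5-bit signed immediates for both the start and the step;
      // anything else is moved into a scratch core register first.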
      int imm;
      if (op.IsImmediate()) {
        if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd_inner, &imm)) {
          return IndexOperand(imm);
        }
        Register scratch = temps->AcquireRegisterToHoldLane(zd_inner);
        masm->Mov(scratch, op);
        return IndexOperand(scratch);
      } else {
        // Plain registers can be encoded directly.
        VIXL_ASSERT(op.IsPlainRegister());
        return IndexOperand(op.GetRegister());
      }
    }

    int GetImm5() const {
      int64_t imm = GetImmediate();
      VIXL_ASSERT(IsInt5(imm));
      return static_cast<int>(imm);
    }

   private:
    explicit IndexOperand(const Register& reg) : Operand(reg) {}
    explicit IndexOperand(int64_t imm) : Operand(imm) {}
  };

  UseScratchRegisterScope temps(this);
  IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd);
  IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd);

  SingleEmissionCheckScope guard(this);
  if (start_enc.IsImmediate()) {
    if (step_enc.IsImmediate()) {
      index(zd, start_enc.GetImm5(), step_enc.GetImm5());
    } else {
      index(zd, start_enc.GetImm5(), step_enc.GetRegister());
    }
  } else {
    if (step_enc.IsImmediate()) {
      index(zd, start_enc.GetRegister(), step_enc.GetImm5());
    } else {
      index(zd, start_enc.GetRegister(), step_enc.GetRegister());
    }
  }
}

void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInLane(zdn));

  if (imm.IsZero()) {
    SingleEmissionCheckScope guard(this);
    insr(zdn, xzr);
    return;
  }

  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireRegisterToHoldLane(zdn);

  // TODO: There are many cases where we could optimise immediates, such as by
  // detecting repeating patterns or FP immediates. We should optimise and
  // abstract this for use in other SVE mov-immediate-like macros.
  Mov(scratch, imm);

  SingleEmissionCheckScope guard(this);
  insr(zdn, scratch);
}

void MacroAssembler::Mla(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& za,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(za)) {
    // zda = zda + (zn * zm)
    SingleEmissionCheckScope guard(this);
    mla(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = za + (zdn * zm)
    SingleEmissionCheckScope guard(this);
    mad(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    // Multiplication is commutative, so we can swap zn and zm.
    // zdm = za + (zdm * zn)
    SingleEmissionCheckScope guard(this);
    mad(zd, pg, zn, za);
  } else {
    // zd = za + (zn * zm)
    ExactAssemblyScope guard(this, 2 * kInstructionSize);
    movprfx(zd, pg, za);
    mla(zd, pg, zn, zm);
  }
}

void MacroAssembler::Mls(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& za,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(za)) {
    // zda = zda - (zn * zm)
    SingleEmissionCheckScope guard(this);
    mls(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = za - (zdn * zm)
    SingleEmissionCheckScope guard(this);
    msb(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    // Multiplication is commutative, so we can swap zn and zm.
    // zdm = za - (zdm * zn)
    SingleEmissionCheckScope guard(this);
    msb(zd, pg, zn, za);
  } else {
    // zd = za - (zn * zm)
    ExactAssemblyScope guard(this, 2 * kInstructionSize);
    movprfx(zd, pg, za);
    mls(zd, pg, zn, zm);
  }
}

void MacroAssembler::CompareHelper(Condition cond,
                                   const PRegisterWithLaneSize& pd,
                                   const PRegisterZ& pg,
                                   const ZRegister& zn,
                                   IntegerOperand imm) {
  UseScratchRegisterScope temps(this);
  ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
  Dup(zm, imm);
  SingleEmissionCheckScope guard(this);
  cmp(cond, pd, pg, zn, zm);
}

void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd,
                            const PRegister& pg,
                            const PRegisterWithLaneSize& pn) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pd.IsLaneSizeB());
  VIXL_ASSERT(pn.IsLaneSizeB());
  if (pd.Is(pn)) {
    SingleEmissionCheckScope guard(this);
    pfirst(pd, pg, pn);
  } else {
    UseScratchRegisterScope temps(this);
    PRegister temp_pg = pg;
    if (pd.Aliases(pg)) {
      temp_pg = temps.AcquireP();
      Mov(temp_pg.VnB(), pg.VnB());
    }
    Mov(pd, pn);
    SingleEmissionCheckScope guard(this);
    pfirst(pd, temp_pg, pd);
  }
}

void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd,
                           const PRegister& pg,
                           const PRegisterWithLaneSize& pn) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(AreSameFormat(pd, pn));
  if (pd.Is(pn)) {
    SingleEmissionCheckScope guard(this);
    pnext(pd, pg, pn);
  } else {
    UseScratchRegisterScope temps(this);
    PRegister temp_pg = pg;
    if (pd.Aliases(pg)) {
      temp_pg = temps.AcquireP();
      Mov(temp_pg.VnB(), pg.VnB());
    }
    Mov(pd.VnB(), pn.VnB());
    SingleEmissionCheckScope guard(this);
    pnext(pd, temp_pg, pd);
  }
}

void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd,
                           SVEPredicateConstraint pattern,
                           FlagsUpdate s) {
  VIXL_ASSERT(allow_macro_instructions_);
  switch (s) {
    case LeaveFlags:
      Ptrue(pd, pattern);
      return;
    case SetFlags:
      Ptrues(pd, pattern);
      return;
  }
  VIXL_UNREACHABLE();
}

void MacroAssembler::Sub(const ZRegister& zd,
                         IntegerOperand imm,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);

  int imm8;
  int shift = -1;
  if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
    MovprfxHelperScope guard(this, zd, zm);
    subr(zd, zd, imm8, shift);
  } else {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits());
    Dup(scratch, imm);

    SingleEmissionCheckScope guard(this);
    sub(zd, scratch, zm);
  }
}

void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt,
                                               const PRegisterZ& pg,
                                               const SVEMemOperand& addr,
                                               SVELoadBroadcastFn fn,
                                               int divisor) {
  VIXL_ASSERT(addr.IsScalarPlusImmediate());
  int64_t imm = addr.GetImmediateOffset();
  if ((imm % divisor == 0) && IsUint6(imm / divisor)) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
  } else {
    UseScratchRegisterScope temps(this);
    Register scratch = temps.AcquireX();
    CalculateSVEAddress(scratch, addr, zt);
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(scratch));
  }
}

void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt,
                                                 const SVEMemOperand& addr,
                                                 SVELoadStoreFn fn) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister());

  if (addr.IsPlainScalar() ||
      (addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) &&
       addr.IsMulVl())) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(rt, addr);
    return;
  }

  if (addr.IsEquivalentToScalar()) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(rt, SVEMemOperand(addr.GetScalarBase()));
    return;
  }

  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireX();
  CalculateSVEAddress(scratch, addr, rt);
  SingleEmissionCheckScope guard(this);
  (this->*fn)(rt, SVEMemOperand(scratch));
}

template <typename Tg, typename Tf>
void MacroAssembler::SVELoadStoreNTBroadcastQOHelper(
    const ZRegister& zt,
    const Tg& pg,
    const SVEMemOperand& addr,
    Tf fn,
    int imm_bits,
    int shift_amount,
    SVEOffsetModifier supported_modifier,
    int vl_divisor_log2) {
  VIXL_ASSERT(allow_macro_instructions_);
  int imm_divisor = 1 << shift_amount;

  if (addr.IsPlainScalar() ||
      (addr.IsScalarPlusImmediate() &&
       IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) &&
       ((addr.GetImmediateOffset() % imm_divisor) == 0) &&
       (addr.GetOffsetModifier() == supported_modifier))) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  if (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
      addr.IsEquivalentToLSL(zt.GetLaneSizeInBytesLog2())) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  if (addr.IsEquivalentToScalar()) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
    return;
  }

  if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) &&
      (vl_divisor_log2 == -1)) {
    // We don't handle [x0, #imm, MUL VL] if the in-memory access size is not
    // VL dependent.
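    // (A vl_divisor_log2 of -1 indicates a VL-independent access size; the
    // LD1RQ/LD1RO helpers below pass -1 for exactly this reason.)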
    VIXL_UNIMPLEMENTED();
  }

  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireX();
  CalculateSVEAddress(scratch, addr, vl_divisor_log2);
  SingleEmissionCheckScope guard(this);
  (this->*fn)(zt, pg, SVEMemOperand(scratch));
}

template <typename Tg, typename Tf>
void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2,
                                         const ZRegister& zt,
                                         const Tg& pg,
                                         const SVEMemOperand& addr,
                                         Tf fn) {
  if (addr.IsPlainScalar() ||
      (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
       addr.IsEquivalentToLSL(msize_in_bytes_log2)) ||
      (addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) &&
       addr.IsMulVl())) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  if (addr.IsEquivalentToScalar()) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
    return;
  }

  if (addr.IsVectorPlusImmediate()) {
    uint64_t offset = addr.GetImmediateOffset();
    if (IsMultiple(offset, (1 << msize_in_bytes_log2)) &&
        IsUint5(offset >> msize_in_bytes_log2)) {
      SingleEmissionCheckScope guard(this);
      (this->*fn)(zt, pg, addr);
      return;
    }
  }

  if (addr.IsScalarPlusVector()) {
    VIXL_ASSERT(addr.IsScatterGather());
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  UseScratchRegisterScope temps(this);
  if (addr.IsScatterGather()) {
    // In scatter-gather modes, zt and zn/zm have the same lane size. However,
    // for 32-bit accesses, the result of each lane's address calculation still
    // requires 64 bits; we can't naively use `Adr` for the address calculation
    // because it would truncate each address to 32 bits.

    if (addr.IsVectorPlusImmediate()) {
      // Synthesise the immediate in an X register, then use a
      // scalar-plus-vector access with the original vector.
      Register scratch = temps.AcquireX();
      Mov(scratch, addr.GetImmediateOffset());
      SingleEmissionCheckScope guard(this);
      SVEOffsetModifier om =
          zt.IsLaneSizeS() ? SVE_UXTW : NO_SVE_OFFSET_MODIFIER;
      (this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om));
      return;
    }

    VIXL_UNIMPLEMENTED();
  } else {
    Register scratch = temps.AcquireX();
    // TODO: If we have an immediate offset that is a multiple of
    // msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to
    // save an instruction.
    int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2;
    CalculateSVEAddress(scratch, addr, vl_divisor_log2);
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(scratch));
  }
}

template <typename Tf>
void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2,
                                     const ZRegister& zt,
                                     const PRegisterZ& pg,
                                     const SVEMemOperand& addr,
                                     Tf fn) {
  if (addr.IsScatterGather()) {
    // Scatter-gather first-fault loads share encodings with normal loads.
    SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn);
    return;
  }

  // Contiguous first-faulting loads have no scalar-plus-immediate form at all,
  // so we don't do immediate synthesis.

  // We cannot currently distinguish "[x0]" from "[x0, #0]", and this
  // is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here.
  if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() &&
                               addr.IsEquivalentToLSL(msize_in_bytes_log2))) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  VIXL_UNIMPLEMENTED();
}

void MacroAssembler::Ld1b(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kBRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1b));
}

void MacroAssembler::Ld1h(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kHRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1h));
}

void MacroAssembler::Ld1w(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kWRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1w));
}

void MacroAssembler::Ld1d(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kDRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1d));
}

void MacroAssembler::Ld1sb(const ZRegister& zt,
                           const PRegisterZ& pg,
                           const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kBRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1sb));
}

void MacroAssembler::Ld1sh(const ZRegister& zt,
                           const PRegisterZ& pg,
                           const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kHRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1sh));
}

void MacroAssembler::Ld1sw(const ZRegister& zt,
                           const PRegisterZ& pg,
                           const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kSRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1sw));
}

void MacroAssembler::St1b(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kBRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1b));
}

void MacroAssembler::St1h(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kHRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1h));
}

void MacroAssembler::St1w(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kSRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1w));
}

void MacroAssembler::St1d(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kDRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1d));
}

void MacroAssembler::Ldff1b(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kBRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1b));
}

void MacroAssembler::Ldff1h(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kHRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1h));
}

void MacroAssembler::Ldff1w(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kSRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1w));
}

void MacroAssembler::Ldff1d(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kDRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1d));
}

void MacroAssembler::Ldff1sb(const ZRegister& zt,
                             const PRegisterZ& pg,
                             const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kBRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1sb));
}

void MacroAssembler::Ldff1sh(const ZRegister& zt,
                             const PRegisterZ& pg,
                             const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kHRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1sh));
}

void MacroAssembler::Ldff1sw(const ZRegister& zt,
                             const PRegisterZ& pg,
                             const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kSRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1sw));
}

#define VIXL_SVE_LD1R_LIST(V) \
  V(qb, 4) V(qh, 4) V(qw, 4) V(qd, 4) V(ob, 5) V(oh, 5) V(ow, 5) V(od, 5)

#define VIXL_DEFINE_MASM_FUNC(SZ, SH)                          \
  void MacroAssembler::Ld1r##SZ(const ZRegister& zt,           \
                                const PRegisterZ& pg,          \
                                const SVEMemOperand& addr) {   \
    VIXL_ASSERT(allow_macro_instructions_);                    \
    SVELoadStoreNTBroadcastQOHelper(zt,                        \
                                    pg,                        \
                                    addr,                      \
                                    &MacroAssembler::ld1r##SZ, \
                                    4,                         \
                                    SH,                        \
                                    NO_SVE_OFFSET_MODIFIER,    \
                                    -1);                       \
  }

VIXL_SVE_LD1R_LIST(VIXL_DEFINE_MASM_FUNC)

#undef VIXL_DEFINE_MASM_FUNC
#undef VIXL_SVE_LD1R_LIST

void MacroAssembler::Ldnt1b(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (addr.IsVectorPlusScalar()) {
    SingleEmissionCheckScope guard(this);
    ldnt1b(zt, pg, addr);
  } else {
    SVELoadStoreNTBroadcastQOHelper(zt,
                                    pg,
                                    addr,
                                    &MacroAssembler::ldnt1b,
                                    4,
                                    0,
                                    SVE_MUL_VL);
  }
}

void MacroAssembler::Ldnt1d(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (addr.IsVectorPlusScalar()) {
    SingleEmissionCheckScope guard(this);
    ldnt1d(zt, pg, addr);
  } else {
    SVELoadStoreNTBroadcastQOHelper(zt,
                                    pg,
                                    addr,
                                    &MacroAssembler::ldnt1d,
                                    4,
                                    0,
                                    SVE_MUL_VL);
  }
}

void MacroAssembler::Ldnt1h(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (addr.IsVectorPlusScalar()) {
    SingleEmissionCheckScope guard(this);
    ldnt1h(zt, pg, addr);
  } else {
    SVELoadStoreNTBroadcastQOHelper(zt,
                                    pg,
                                    addr,
                                    &MacroAssembler::ldnt1h,
                                    4,
                                    0,
                                    SVE_MUL_VL);
  }
}

void MacroAssembler::Ldnt1w(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (addr.IsVectorPlusScalar()) {
    SingleEmissionCheckScope guard(this);
    ldnt1w(zt, pg, addr);
  } else {
    SVELoadStoreNTBroadcastQOHelper(zt,
                                    pg,
                                    addr,
                                    &MacroAssembler::ldnt1w,
                                    4,
                                    0,
                                    SVE_MUL_VL);
  }
}

void MacroAssembler::Stnt1b(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (addr.IsVectorPlusScalar()) {
    SingleEmissionCheckScope guard(this);
    stnt1b(zt, pg, addr);
  } else {
    SVELoadStoreNTBroadcastQOHelper(zt,
                                    pg,
                                    addr,
                                    &MacroAssembler::stnt1b,
                                    4,
                                    0,
                                    SVE_MUL_VL);
  }
}

void MacroAssembler::Stnt1d(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (addr.IsVectorPlusScalar()) {
    SingleEmissionCheckScope guard(this);
    stnt1d(zt, pg, addr);
  } else {
    SVELoadStoreNTBroadcastQOHelper(zt,
                                    pg,
                                    addr,
                                    &MacroAssembler::stnt1d,
                                    4,
                                    0,
                                    SVE_MUL_VL);
  }
}

void MacroAssembler::Stnt1h(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (addr.IsVectorPlusScalar()) {
    SingleEmissionCheckScope guard(this);
    stnt1h(zt, pg, addr);
  } else {
    SVELoadStoreNTBroadcastQOHelper(zt,
                                    pg,
                                    addr,
                                    &MacroAssembler::stnt1h,
                                    4,
                                    0,
                                    SVE_MUL_VL);
  }
}

void MacroAssembler::Stnt1w(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (addr.IsVectorPlusScalar()) {
    SingleEmissionCheckScope guard(this);
    stnt1w(zt, pg, addr);
  } else {
    SVELoadStoreNTBroadcastQOHelper(zt,
                                    pg,
                                    addr,
                                    &MacroAssembler::stnt1w,
                                    4,
                                    0,
                                    SVE_MUL_VL);
  }
}

void MacroAssembler::SVEDotIndexHelper(ZZZImmFn fn,
                                       const ZRegister& zd,
                                       const ZRegister& za,
                                       const ZRegister& zn,
                                       const ZRegister& zm,
                                       int index) {
  if (zd.Aliases(za)) {
    // zda = zda + (zn . zm)
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, zn, zm, index);

  } else if (zd.Aliases(zn) || zd.Aliases(zm)) {
    // zdn = za + (zdn . zm[index])
    // zdm = za + (zn . zdm[index])
    // zdnm = za + (zdnm . zdnm[index])
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm, index);
    }

    Mov(zd, scratch);
  } else {
    // zd = za + (zn . zm)
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm, index);
  }
}

void MacroAssembler::FourRegDestructiveHelper(Int3ArithFn fn,
                                              const ZRegister& zd,
                                              const ZRegister& za,
                                              const ZRegister& zn,
                                              const ZRegister& zm) {
  if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
    // zd = za . zd . zm
    // zd = za . zn . zd
    // zd = za . zd . zd
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm);
    }

    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm);
  }
}

void MacroAssembler::FourRegDestructiveHelper(Int4ArithFn fn,
                                              const ZRegister& zd,
                                              const ZRegister& za,
                                              const ZRegister& zn,
                                              const ZRegister& zm) {
  if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
    // zd = za . zd . zm
    // zd = za . zn . zd
    // zd = za . zd . zd
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, scratch, zn, zm);
    }

    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zd, zn, zm);
  }
}

void MacroAssembler::FourRegOneImmDestructiveHelper(ZZZImmFn fn,
                                                    const ZRegister& zd,
                                                    const ZRegister& za,
                                                    const ZRegister& zn,
                                                    const ZRegister& zm,
                                                    int imm) {
  if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
    // zd = za . zd . zm[i]
    // zd = za . zn . zd[i]
    // zd = za . zd . zd[i]
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm, imm);
    }

    Mov(zd, scratch);
  } else {
    // zd = za . zn . zm[i]
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm, imm);
  }
}

void MacroAssembler::AbsoluteDifferenceAccumulate(Int3ArithFn fn,
                                                  const ZRegister& zd,
                                                  const ZRegister& za,
                                                  const ZRegister& zn,
                                                  const ZRegister& zm) {
  if (zn.Aliases(zm)) {
    // If zn == zm, the difference is zero.
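    // The accumulation therefore reduces to a copy of za.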
    if (!zd.Aliases(za)) {
      Mov(zd, za);
    }
  } else if (zd.Aliases(za)) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, zn, zm);
  } else if (zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
    Mov(ztmp, zn);
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, ztmp, zm);
  } else if (zd.Aliases(zm)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
    Mov(ztmp, zm);
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm);
  }
}

#define VIXL_SVE_4REG_LIST(V)                       \
  V(Saba, saba, AbsoluteDifferenceAccumulate)       \
  V(Uaba, uaba, AbsoluteDifferenceAccumulate)       \
  V(Sabalb, sabalb, AbsoluteDifferenceAccumulate)   \
  V(Sabalt, sabalt, AbsoluteDifferenceAccumulate)   \
  V(Uabalb, uabalb, AbsoluteDifferenceAccumulate)   \
  V(Uabalt, uabalt, AbsoluteDifferenceAccumulate)   \
  V(Sdot, sdot, FourRegDestructiveHelper)           \
  V(Udot, udot, FourRegDestructiveHelper)           \
  V(Adclb, adclb, FourRegDestructiveHelper)         \
  V(Adclt, adclt, FourRegDestructiveHelper)         \
  V(Sbclb, sbclb, FourRegDestructiveHelper)         \
  V(Sbclt, sbclt, FourRegDestructiveHelper)         \
  V(Smlalb, smlalb, FourRegDestructiveHelper)       \
  V(Smlalt, smlalt, FourRegDestructiveHelper)       \
  V(Smlslb, smlslb, FourRegDestructiveHelper)       \
  V(Smlslt, smlslt, FourRegDestructiveHelper)       \
  V(Umlalb, umlalb, FourRegDestructiveHelper)       \
  V(Umlalt, umlalt, FourRegDestructiveHelper)       \
  V(Umlslb, umlslb, FourRegDestructiveHelper)       \
  V(Umlslt, umlslt, FourRegDestructiveHelper)       \
  V(Bcax, bcax, FourRegDestructiveHelper)           \
  V(Bsl, bsl, FourRegDestructiveHelper)             \
  V(Bsl1n, bsl1n, FourRegDestructiveHelper)         \
  V(Bsl2n, bsl2n, FourRegDestructiveHelper)         \
  V(Eor3, eor3, FourRegDestructiveHelper)           \
  V(Nbsl, nbsl, FourRegDestructiveHelper)           \
  V(Fmlalb, fmlalb, FourRegDestructiveHelper)       \
  V(Fmlalt, fmlalt, FourRegDestructiveHelper)       \
  V(Fmlslb, fmlslb, FourRegDestructiveHelper)       \
  V(Fmlslt, fmlslt, FourRegDestructiveHelper)       \
  V(Sqdmlalb, sqdmlalb, FourRegDestructiveHelper)   \
  V(Sqdmlalbt, sqdmlalbt, FourRegDestructiveHelper) \
  V(Sqdmlalt, sqdmlalt, FourRegDestructiveHelper)   \
  V(Sqdmlslb, sqdmlslb, FourRegDestructiveHelper)   \
  V(Sqdmlslbt, sqdmlslbt, FourRegDestructiveHelper) \
  V(Sqdmlslt, sqdmlslt, FourRegDestructiveHelper)   \
  V(Sqrdmlah, sqrdmlah, FourRegDestructiveHelper)   \
  V(Sqrdmlsh, sqrdmlsh, FourRegDestructiveHelper)   \
  V(Fmmla, fmmla, FourRegDestructiveHelper)         \
  V(Smmla, smmla, FourRegDestructiveHelper)         \
  V(Ummla, ummla, FourRegDestructiveHelper)         \
  V(Usmmla, usmmla, FourRegDestructiveHelper)       \
  V(Usdot, usdot, FourRegDestructiveHelper)

#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \
  void MacroAssembler::MASMFN(const ZRegister& zd,   \
                              const ZRegister& za,   \
                              const ZRegister& zn,   \
                              const ZRegister& zm) { \
    VIXL_ASSERT(allow_macro_instructions_);          \
    HELPER(&Assembler::ASMFN, zd, za, zn, zm);       \
  }
VIXL_SVE_4REG_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

#define VIXL_SVE_4REG_1IMM_LIST(V)                      \
  V(Fmla, fmla, FourRegOneImmDestructiveHelper)         \
  V(Fmls, fmls, FourRegOneImmDestructiveHelper)         \
  V(Fmlalb, fmlalb, FourRegOneImmDestructiveHelper)     \
  V(Fmlalt, fmlalt, FourRegOneImmDestructiveHelper)     \
  V(Fmlslb, fmlslb, FourRegOneImmDestructiveHelper)     \
  V(Fmlslt, fmlslt, FourRegOneImmDestructiveHelper)     \
  V(Mla, mla, FourRegOneImmDestructiveHelper)           \
  V(Mls, mls, FourRegOneImmDestructiveHelper)           \
  V(Smlalb, smlalb, FourRegOneImmDestructiveHelper)     \
  V(Smlalt, smlalt, FourRegOneImmDestructiveHelper)     \
  V(Smlslb, smlslb, FourRegOneImmDestructiveHelper)     \
  V(Smlslt, smlslt, FourRegOneImmDestructiveHelper)     \
  V(Sqdmlalb, sqdmlalb, FourRegOneImmDestructiveHelper) \
  V(Sqdmlalt, sqdmlalt, FourRegOneImmDestructiveHelper) \
  V(Sqdmlslb, sqdmlslb, FourRegOneImmDestructiveHelper) \
  V(Sqdmlslt, sqdmlslt, FourRegOneImmDestructiveHelper) \
  V(Sqrdmlah, sqrdmlah, FourRegOneImmDestructiveHelper) \
  V(Sqrdmlsh, sqrdmlsh, FourRegOneImmDestructiveHelper) \
  V(Umlalb, umlalb, FourRegOneImmDestructiveHelper)     \
  V(Umlalt, umlalt, FourRegOneImmDestructiveHelper)     \
  V(Umlslb, umlslb, FourRegOneImmDestructiveHelper)     \
  V(Umlslt, umlslt, FourRegOneImmDestructiveHelper)

#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER)  \
  void MacroAssembler::MASMFN(const ZRegister& zd,    \
                              const ZRegister& za,    \
                              const ZRegister& zn,    \
                              const ZRegister& zm,    \
                              int imm) {              \
    VIXL_ASSERT(allow_macro_instructions_);           \
    HELPER(&Assembler::ASMFN, zd, za, zn, zm, imm);   \
  }
VIXL_SVE_4REG_1IMM_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

void MacroAssembler::Sdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index);
}

void MacroAssembler::Udot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::udot, zd, za, zn, zm, index);
}

void MacroAssembler::Sudot(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::sudot, zd, za, zn, zm, index);
}

void MacroAssembler::Usdot(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::usdot, zd, za, zn, zm, index);
}

void MacroAssembler::Cdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index,
                          int rot) {
  // This doesn't handle zm when it is outside the range that can be encoded
  // in the instruction. That range depends on the source element size:
  // z0-z7 for B, z0-z15 for H.
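  // For example (illustrative only), a caller already in the encodable range
  // might write:
  //   Cdot(z0.VnS(), z0.VnS(), z1.VnB(), z2.VnB(), 1, 90);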
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, ztmp, za);
      cdot(ztmp, zn, zm, index, rot);
    }
    Mov(zd, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, zn, zm, index, rot);
  }
}

void MacroAssembler::Cdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int rot) {
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    VIXL_ASSERT(AreSameLaneSize(zn, zm));
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
    Mov(ztmp, zd.Aliases(zn) ? zn : zm);
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, (zd.Aliases(zn) ? ztmp : zn), (zd.Aliases(zm) ? ztmp : zm), rot);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, zn, zm, rot);
  }
}

void MacroAssembler::FPMulAddHelper(const ZRegister& zd,
                                    const PRegisterM& pg,
                                    const ZRegister& za,
                                    const ZRegister& zn,
                                    const ZRegister& zm,
                                    SVEMulAddPredicatedZdaFn fn_zda,
                                    SVEMulAddPredicatedZdnFn fn_zdn,
                                    FPMacroNaNPropagationOption nan_option) {
  ResolveFPNaNPropagationOption(&nan_option);

  if (zd.Aliases(za)) {
    // zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zda)(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zdn)(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    switch (nan_option) {
      case FastNaNPropagation: {
        // We treat multiplication as commutative in the fast mode, so we can
        // swap zn and zm.
        // zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb.
        SingleEmissionCheckScope guard(this);
        (this->*fn_zdn)(zd, pg, zn, za);
        return;
      }
      case StrictNaNPropagation: {
        UseScratchRegisterScope temps(this);
        // Use a scratch register to keep the argument order exactly as
        // specified.
        ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
        {
          MovprfxHelperScope guard(this, scratch, pg, za);
          // scratch = (-)za + ((-)zn * zm)
          (this->*fn_zda)(scratch, pg, zn, zm);
        }
        Mov(zd, scratch);
        return;
      }
      case NoFPMacroNaNPropagationSelected:
        VIXL_UNREACHABLE();
        return;
    }
  } else {
    // zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
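    // zd does not alias any of the inputs here, so prefix za into zd and use
    // the zda form directly.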
    MovprfxHelperScope guard(this, zd, pg, za);
    (this->*fn_zda)(zd, pg, zn, zm);
  }
}

void MacroAssembler::Fmla(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmla,
                 &Assembler::fmad,
                 nan_option);
}

void MacroAssembler::Fmls(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmls,
                 &Assembler::fmsb,
                 nan_option);
}

void MacroAssembler::Fnmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmla,
                 &Assembler::fnmad,
                 nan_option);
}

void MacroAssembler::Fnmls(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmls,
                 &Assembler::fnmsb,
                 nan_option);
}

void MacroAssembler::Ftmad(const ZRegister& zd,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int imm3) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(scratch, zm);
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, scratch, imm3);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, zm, imm3);
  }
}

void MacroAssembler::Fcadd(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, pg, zn);
      fcadd(scratch, pg, scratch, zm, rot);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    fcadd(zd, pg, zd, zm, rot);
  }
}

void MacroAssembler::Fcmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, ztmp, za);
      fcmla(ztmp, pg, zn, zm, rot);
    }
    Mov(zd, pg, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, pg, za);
    fcmla(zd, pg, zn, zm, rot);
  }
}

void MacroAssembler::Splice(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
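  // SVE2 provides a constructive SPLICE form that takes two consecutive
  // source registers; use it when it applies. Otherwise fall back to the
  // destructive form, copying through a scratch register when zd aliases zm
  // but not zn.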
  VIXL_ASSERT(allow_macro_instructions_);
  if (CPUHas(CPUFeatures::kSVE2) && AreConsecutive(zn, zm) && !zd.Aliases(zn)) {
    SingleEmissionCheckScope guard(this);
    splice(zd, pg, zn, zm);
  } else if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      splice(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    splice(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clasta(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clasta(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clasta(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clastb(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clastb(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clastb(zd, pg, zd, zm);
  }
}

void MacroAssembler::ShiftRightAccumulate(IntArithImmFn fn,
                                          const ZRegister& zd,
                                          const ZRegister& za,
                                          const ZRegister& zn,
                                          int shift) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (!zd.Aliases(za) && zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
    Mov(ztmp, zn);
    {
      MovprfxHelperScope guard(this, zd, za);
      (this->*fn)(zd, ztmp, shift);
    }
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, shift);
  }
}

void MacroAssembler::Srsra(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           int shift) {
  ShiftRightAccumulate(&Assembler::srsra, zd, za, zn, shift);
}

void MacroAssembler::Ssra(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          int shift) {
  ShiftRightAccumulate(&Assembler::ssra, zd, za, zn, shift);
}

void MacroAssembler::Ursra(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           int shift) {
  ShiftRightAccumulate(&Assembler::ursra, zd, za, zn, shift);
}

void MacroAssembler::Usra(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          int shift) {
  ShiftRightAccumulate(&Assembler::usra, zd, za, zn, shift);
}

void MacroAssembler::ComplexAddition(ZZZImmFn fn,
                                     const ZRegister& zd,
                                     const ZRegister& zn,
                                     const ZRegister& zm,
                                     int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (!zd.Aliases(zn) && zd.Aliases(zm)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(ztmp, zm);
    {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*fn)(zd, zd, ztmp, rot);
    }
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    (this->*fn)(zd, zd, zm, rot);
  }
}

void MacroAssembler::Cadd(const ZRegister& zd,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int rot) {
  ComplexAddition(&Assembler::cadd, zd, zn, zm, rot);
}

void MacroAssembler::Sqcadd(const ZRegister& zd,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            int rot) {
  ComplexAddition(&Assembler::sqcadd, zd, zn, zm, rot);
}

}  // namespace aarch64
}  // namespace vixl