/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#ifndef ACO_IR_H
#define ACO_IR_H

#include "aco_opcodes.h"
#include "aco_util.h"
#include "aco_interface.h"
#include "aco_shader_info.h"
#include "vulkan/radv_shader.h"

#include "nir.h"

#include <bitset>
#include <memory>
#include <vector>

struct radv_shader_args;

namespace aco {

extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
};

/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP16 = 1 << 13,
   SDWA = 1 << 14,
   DPP8 = 1 << 15,
};

enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count,
};

enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_task_payload = 0x20, /* Task-Mesh payload */
   storage_scratch = 0x40,
   storage_vgpr_spill = 0x80,
   storage_count = 8, /* not counting storage_none */
};

enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for loads/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};

enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};

struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
       : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
   {}

   storage_class storage : 8;
   memory_semantics semantics : 8;
   sync_scope scope : 8;

   bool operator==(const memory_sync_info& rhs) const
   {
      return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
   }

   bool can_reorder() const
   {
      if (semantics & semantic_acqrel)
         return false;
      /* Also check storage so that zero-initialized memory_sync_info can be
       * reordered. */
      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
   }
};
static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
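
/* Illustrative usage (not part of the upstream header): a buffer store with
 * release semantics at workgroup scope would be described as
 *   memory_sync_info sync(storage_buffer, semantic_release, scope_workgroup);
 * which cannot be reordered (the acquire/release check in can_reorder()),
 * while a default-constructed memory_sync_info has storage_none and
 * can_reorder() == true. */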

enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};

enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};

struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32 : 2;
         fp_round round16_64 : 2;
         unsigned denorm32 : 2;
         unsigned denorm16_64 : 2;
      };
      struct {
         uint8_t round : 4;
         uint8_t denorm : 4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32 : 1;
   bool preserve_signed_zero_inf_nan16_64 : 1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32 : 1;
   bool must_flush_denorms16_64 : 1;
   bool care_about_round32 : 1;
   bool care_about_round16_64 : 1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead. */
   bool canReplace(float_mode other) const noexcept
   {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};

struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vs;

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   wait_imm(enum amd_gfx_level chip, uint16_t packed);

   uint16_t pack(enum amd_gfx_level chip) const;

   bool combine(const wait_imm& other);

   bool empty() const;
};

constexpr Format
asVOP3(Format format)
{
   return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
};

constexpr Format
asSDWA(Format format)
{
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
}

constexpr Format
withoutDPP(Format format)
{
   return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
}
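
/* Illustrative compile-time examples (not part of the upstream header): a
 * VOP2 instruction encoded as VOP3 keeps both format bits set, so it still
 * reports as VOP2. */
static_assert((uint16_t)asVOP3(Format::VOP2) ==
                 ((uint16_t)Format::VOP2 | (uint16_t)Format::VOP3),
              "asVOP3 only adds the VOP3 bit");
static_assert((uint16_t)withoutDPP(asVOP3(Format::VOP2)) == (uint16_t)asVOP3(Format::VOP2),
              "withoutDPP is a no-op when no DPP16/DPP8 bit is set");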

enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

   constexpr RegClass resize(unsigned bytes) const
   {
      if (is_linear_vgpr()) {
         assert(bytes % 4u == 0);
         return get(RegType::vgpr, bytes).as_linear();
      }
      return get(type(), bytes);
   }

private:
   RC rc;
};

/* transitional helper expressions */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
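
/* Illustrative compile-time examples (not part of the upstream header): */
static_assert(RegClass::get(RegType::vgpr, 6) == v6b, "non-dword-sized VGPR RCs are sub-dword");
static_assert(v6b.bytes() == 6 && v6b.size() == 2, "size() rounds up to whole registers");
static_assert(RegClass::get(RegType::sgpr, 6) == s2, "SGPR RCs are always whole dwords");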

/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_ : 24;
   uint32_t reg_class : 8;
};

/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg res = *this;
      res.reg_b += bytes;
      return res;
   }

   uint16_t reg_b = 0;
};

/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
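
/* Illustrative compile-time examples (not part of the upstream header):
 * PhysReg addresses registers at byte granularity (reg_b = reg() * 4 + byte()). */
static_assert(vcc.advance(4) == vcc_hi, "advancing by a dword moves to the next register");
static_assert(vcc.advance(1).byte() == 1 && vcc.advance(1).reg() == 106, "sub-dword offset");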

/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 */
class Operand final {
public:
   constexpr Operand()
       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
         is24bit_(false), signext(false)
   {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   explicit Operand(Temp r, PhysReg reg) noexcept
   {
      assert(r.id()); /* Don't allow fixing an undef to a register */
      data_.temp = r;
      isTemp_ = true;
      setFixed(reg);
   };

   /* 8-bit constant */
   static Operand c8(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies and copies from any 8-bit
       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 0;
      op.setFixed(PhysReg{0u});
      return op;
   };

   /* 16-bit constant */
   static Operand c16(uint16_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 1;
      if (v <= 64)
         op.setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         op.setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         op.setFixed(PhysReg{248});
      else /* Literal Constant */
         op.setFixed(PhysReg{255});
      return op;
   }

   /* 32-bit constant */
   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }

   /* 64-bit constant */
   static Operand c64(uint64_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.isConstant_ = true;
      op.constSize = 3;
      if (v <= 64) {
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{128 + (uint32_t)v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{192 - (uint32_t)v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         op.data_.i = 0x3f000000;
         op.setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         op.data_.i = 0xbf000000;
         op.setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         op.data_.i = 0x3f800000;
         op.setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         op.data_.i = 0xbf800000;
         op.setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         op.data_.i = 0x40000000;
         op.setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         op.data_.i = 0xc0000000;
         op.setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         op.data_.i = 0x40800000;
         op.setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         op.data_.i = 0xc0800000;
         op.setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double. */
         op.signext = v >> 63;
         op.data_.i = v & 0xffffffffu;
         op.setFixed(PhysReg{255});
         assert(op.constantValue64() == v &&
                "attempt to create an unrepresentable 64-bit literal constant");
      }
      return op;
   }

   /* 32-bit constant stored as a 32-bit or 64-bit operand */
   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = is64bit ? 3 : 2;
      if (v <= 64)
         op.setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         op.setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         op.setFixed(PhysReg{255});
      }
      return op;
   }

   static Operand literal32(uint32_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 2;
      op.setFixed(PhysReg{255});
      return op;
   }

   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   static Operand zero(unsigned bytes = 4) noexcept
   {
      if (bytes == 8)
         return Operand::c64(0);
      else if (bytes == 4)
         return Operand::c32(0);
      else if (bytes == 2)
         return Operand::c16(0);
      assert(bytes == 1);
      return Operand::c8(0);
   }

   /* This is useful over the constructors when you want to take a gfx level
    * for 1/2 PI or an unknown operand size.
    */
   static Operand get_const(enum amd_gfx_level chip, uint64_t val, unsigned bytes)
   {
      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
         /* 1/2 PI can be an inline constant on GFX8+ */
         Operand op = Operand::c32(val);
         op.setFixed(PhysReg{248});
         return op;
      }

      if (bytes == 8)
         return Operand::c64(val);
      else if (bytes == 4)
         return Operand::c32(val);
      else if (bytes == 2)
         return Operand::c16(val);
      assert(bytes == 1);
      return Operand::c8(val);
   }
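
   /* Illustrative examples (not part of the upstream header): small integers
    * and select floats map to the inline-constant "registers" 128-248, while
    * everything else becomes a literal fixed to register 255:
    *   Operand::c32(42)         -> PhysReg{170}, isLiteral() == false
    *   Operand::c32(0x3f800000) -> PhysReg{242} (1.0f)
    *   Operand::c32(0x12345678) -> PhysReg{255}, isLiteral() == true
    */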

   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
                                         bool sext = false)
   {
      if (bytes <= 4)
         return true;

      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
         return true;
      uint64_t upper33 = val & 0xFFFFFFFF80000000;
      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
         return true;

      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
             val == 0x3FE0000000000000 ||              /* 0.5 */
             val == 0xBFE0000000000000 ||              /* -0.5 */
             val == 0x3FF0000000000000 ||              /* 1.0 */
             val == 0xBFF0000000000000 ||              /* -1.0 */
             val == 0x4000000000000000 ||              /* 2.0 */
             val == 0xC000000000000000 ||              /* -2.0 */
             val == 0x4010000000000000 ||              /* 4.0 */
             val == 0xC010000000000000;                /* -4.0 */
   }

   constexpr bool isTemp() const noexcept { return isTemp_; }

   constexpr void setTemp(Temp t) noexcept
   {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept { return data_.temp; }

   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }

   constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }

   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept { return isConstant_; }

   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }

   constexpr bool isUndefined() const noexcept { return isUndef_; }

   constexpr uint32_t constantValue() const noexcept { return data_.i; }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   constexpr uint64_t constantValue64() const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240: return 0x3FE0000000000000;
         case 241: return 0xBFE0000000000000;
         case 242: return 0x3FF0000000000000;
         case 243: return 0xBFF0000000000000;
         case 244: return 0x4000000000000000;
         case 245: return 0xC000000000000000;
         case 246: return 0x4010000000000000;
         case 247: return 0xC010000000000000;
         case 255:
            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
         }
         unreachable("invalid register for 64-bit constant");
      } else {
         return data_.i;
      }
   }

   /* Value if this were used with vop3/opsel or vop3p. */
   constexpr uint16_t constantValue16(bool opsel) const noexcept
   {
      assert(bytes() == 2 || bytes() == 4);
      if (opsel) {
         if (bytes() == 2 && int16_t(data_.i) >= -16 && int16_t(data_.i) <= 64 && !isLiteral())
            /* 16-bit inline integers are sign-extended, even with fp16 instrs */
            return int16_t(data_.i) >> 16;
         else
            return data_.i >> 16;
      }
      return data_.i;
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }

   constexpr bool isLateKill() const noexcept { return isLateKill_; }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }

   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }

   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }

   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }

   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }

private:
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
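
/* Illustrative example (not part of the upstream header): when one temporary
 * dies at an instruction that reads it twice, liveness analysis sets
 * setKill(true) on both operands but setFirstKill(true) only on the first.
 * setLateKill(true) additionally tells RA that the operand's registers must
 * stay distinct from the instruction's definitions. */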
other.temp); } 942 943 constexpr RegClass regClass() const noexcept { return temp.regClass(); } 944 945 constexpr unsigned bytes() const noexcept { return temp.bytes(); } 946 947 constexpr unsigned size() const noexcept { return temp.size(); } 948 949 constexpr bool isFixed() const noexcept { return isFixed_; } 950 951 constexpr PhysReg physReg() const noexcept { return reg_; } 952 953 constexpr void setFixed(PhysReg reg) noexcept 954 { 955 isFixed_ = 1; 956 reg_ = reg; 957 } 958 959 constexpr void setKill(bool flag) noexcept { isKill_ = flag; } 960 961 constexpr bool isKill() const noexcept { return isKill_; } 962 963 constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; } 964 965 constexpr bool isPrecise() const noexcept { return isPrecise_; } 966 967 /* No Unsigned Wrap */ 968 constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; } 969 970 constexpr bool isNUW() const noexcept { return isNUW_; } 971 972 constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; } 973 974 constexpr bool isNoCSE() const noexcept { return isNoCSE_; } 975 976private: 977 Temp temp = Temp(0, s1); 978 PhysReg reg_; 979 union { 980 struct { 981 uint8_t isFixed_ : 1; 982 uint8_t isKill_ : 1; 983 uint8_t isPrecise_ : 1; 984 uint8_t isNUW_ : 1; 985 uint8_t isNoCSE_ : 1; 986 }; 987 /* can't initialize bit-fields in c++11, so work around using a union */ 988 uint8_t control_ = 0; 989 }; 990}; 991 992struct Block; 993struct Instruction; 994struct Pseudo_instruction; 995struct SOP1_instruction; 996struct SOP2_instruction; 997struct SOPK_instruction; 998struct SOPP_instruction; 999struct SOPC_instruction; 1000struct SMEM_instruction; 1001struct DS_instruction; 1002struct MTBUF_instruction; 1003struct MUBUF_instruction; 1004struct MIMG_instruction; 1005struct Export_instruction; 1006struct FLAT_instruction; 1007struct Pseudo_branch_instruction; 1008struct Pseudo_barrier_instruction; 1009struct Pseudo_reduction_instruction; 1010struct VOP3P_instruction; 1011struct VOP1_instruction; 1012struct VOP2_instruction; 1013struct VOPC_instruction; 1014struct VOP3_instruction; 1015struct Interp_instruction; 1016struct DPP16_instruction; 1017struct DPP8_instruction; 1018struct SDWA_instruction; 1019 1020struct Instruction { 1021 aco_opcode opcode; 1022 Format format; 1023 uint32_t pass_flags; 1024 1025 aco::span<Operand> operands; 1026 aco::span<Definition> definitions; 1027 1028 constexpr bool usesModifiers() const noexcept; 1029 1030 constexpr bool reads_exec() const noexcept 1031 { 1032 for (const Operand& op : operands) { 1033 if (op.isFixed() && op.physReg() == exec) 1034 return true; 1035 } 1036 return false; 1037 } 1038 1039 Pseudo_instruction& pseudo() noexcept 1040 { 1041 assert(isPseudo()); 1042 return *(Pseudo_instruction*)this; 1043 } 1044 const Pseudo_instruction& pseudo() const noexcept 1045 { 1046 assert(isPseudo()); 1047 return *(Pseudo_instruction*)this; 1048 } 1049 constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; } 1050 SOP1_instruction& sop1() noexcept 1051 { 1052 assert(isSOP1()); 1053 return *(SOP1_instruction*)this; 1054 } 1055 const SOP1_instruction& sop1() const noexcept 1056 { 1057 assert(isSOP1()); 1058 return *(SOP1_instruction*)this; 1059 } 1060 constexpr bool isSOP1() const noexcept { return format == Format::SOP1; } 1061 SOP2_instruction& sop2() noexcept 1062 { 1063 assert(isSOP2()); 1064 return *(SOP2_instruction*)this; 1065 } 1066 const SOP2_instruction& sop2() const noexcept 1067 { 1068 assert(isSOP2()); 1069 return *(SOP2_instruction*)this; 

struct Block;
struct Instruction;
struct Pseudo_instruction;
struct SOP1_instruction;
struct SOP2_instruction;
struct SOPK_instruction;
struct SOPP_instruction;
struct SOPC_instruction;
struct SMEM_instruction;
struct DS_instruction;
struct MTBUF_instruction;
struct MUBUF_instruction;
struct MIMG_instruction;
struct Export_instruction;
struct FLAT_instruction;
struct Pseudo_branch_instruction;
struct Pseudo_barrier_instruction;
struct Pseudo_reduction_instruction;
struct VOP3P_instruction;
struct VOP1_instruction;
struct VOP2_instruction;
struct VOPC_instruction;
struct VOP3_instruction;
struct Interp_instruction;
struct DPP16_instruction;
struct DPP8_instruction;
struct SDWA_instruction;

struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP16_instruction& dpp16() noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   const DPP16_instruction& dpp16() const noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
   DPP8_instruction& dpp8() noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   const DPP8_instruction& dpp8() const noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
   constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
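
/* Illustrative example (not part of the upstream header): the Vector ALU
 * format bits combine, so an instruction created with asSDWA(Format::VOP2)
 * satisfies both isVOP2() and isSDWA(), and code can dispatch like:
 *   if (instr->isVOP2() && instr->isSDWA())
 *      instr->sdwa().dst_sel = SubdwordSel::ubyte0;
 */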

struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct SOPC_instruction : public Instruction {
   uint32_t padding;
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {
   uint32_t padding;
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP3_instruction : public Instruction {
   bool abs[3];
   bool neg[3];
   uint8_t opsel : 4;
   uint8_t omod : 2;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3]; /* abs modifier, for v_mad_mix/v_fma_mix */
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP16_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP16_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct DPP8_instruction : public Instruction {
   uint8_t lane_sel[8];
};
static_assert(sizeof(DPP8_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   SubdwordSel() : sel((sdwa_sel)0) {}
   constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   explicit operator bool() const { return sel != 0; }

   constexpr unsigned size() const { return (sel >> 2) & 0x7; }
   constexpr unsigned offset() const { return sel & 0x3; }
   constexpr bool sign_extend() const { return sel & sext; }
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      reg_byte_offset += offset();
      if (size() == 1)
         return reg_byte_offset;
      else if (size() == 2)
         return 4 + (reg_byte_offset >> 1);
      else
         return 6;
   }

private:
   sdwa_sel sel;
};
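
/* Illustrative compile-time examples (not part of the upstream header): the
 * selector packs (sign_extend ? sext : 0) | size << 2 | offset, so the signed
 * high word of a dword is SubdwordSel(2, 2, true), i.e. sword1, and
 * to_sdwa_sel() maps it to the hardware WORD_1 encoding. */
static_assert(SubdwordSel(2, 2, true) == SubdwordSel::sword1, "size 2, offset 2, signed");
static_assert(SubdwordSel(SubdwordSel::sword1).to_sdwa_sel(0) == 5, "WORD_1");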

/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2];
   SubdwordSel dst_sel;
   bool neg[2];
   bool abs[2];
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 5;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds;
   uint16_t offset0;
   uint8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   uint16_t offset;          /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Image Instructions
 * Operand(0): SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
 * Operand(3): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   int16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Export_instruction : public Instruction {
   uint8_t enabled_mask;
   uint8_t dest;
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};

/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");

struct instr_deleter_functor {
   void operator()(void* p) { free(p); }
};

template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char* data = (char*)calloc(1, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
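
/* Illustrative usage (not part of the upstream header): operands and
 * definitions are allocated in the same calloc'd block directly behind the
 * instruction, so an s_add_u32 (two operands, result + scc definitions)
 * could be built as:
 *   aco_ptr<SOP2_instruction> add{
 *      create_instruction<SOP2_instruction>(aco_opcode::s_add_u32, Format::SOP2, 2, 2)};
 *   add->operands[0] = Operand::c32(1);
 *   add->definitions[1] = Definition(scc, s1);
 */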

constexpr bool
Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VOP3P_instruction& vop3p = this->vop3p();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
            return true;

         /* opsel_hi must be 1 to not be considered a modifier - even for constants */
         if (!(vop3p.opsel_hi & (1 << i)))
            return true;
      }
      return vop3p.opsel_lo || vop3p.clamp;
   } else if (isVOP3()) {
      const VOP3_instruction& vop3 = this->vop3();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3.abs[i] || vop3.neg[i])
            return true;
      }
      return vop3.opsel || vop3.clamp || vop3.omod;
   }
   return false;
}

constexpr bool
is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

static inline bool
is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}

memory_sync_info get_sync_info(const Instruction* instr);

bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);

bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr);
aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8);
bool needs_exec_mask(const Instruction* instr);

aco_opcode get_ordered(aco_opcode op);
aco_opcode get_unordered(aco_opcode op);
aco_opcode get_inverse(aco_opcode op);
aco_opcode get_f32_cmp(aco_opcode op);
aco_opcode get_vcmpx(aco_opcode op);
unsigned get_cmp_bitsize(aco_opcode op);
bool is_cmp(aco_opcode op);

bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);

uint32_t get_reduction_identity(ReduceOp op, unsigned idx);

unsigned get_mimg_nsa_dwords(const Instruction* instr);

bool should_form_clause(const Instruction* a, const Instruction* b);

enum block_kind {
   /* uniform indicates that when leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_branch = 1 << 8,
   block_kind_merge = 1 << 9,
   block_kind_invert = 1 << 10,
   block_kind_uses_discard = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_export_end = 1 << 15,
};

struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
   {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   constexpr bool exceeds(const RegisterDemand other) const noexcept
   {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   constexpr RegisterDemand operator+(const Temp t) const noexcept
   {
      if (t.type() == RegType::sgpr)
         return RegisterDemand(vgpr, sgpr + t.size());
      else
         return RegisterDemand(vgpr + t.size(), sgpr);
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
   {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
   {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   constexpr void update(const RegisterDemand other) noexcept
   {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }
};
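
/* Example (illustrative): RegisterDemand combines per component. Assuming
 * `tmp` is a one-dword VGPR temporary:
 *
 *    RegisterDemand demand(8, 16);            // 8 VGPRs, 16 SGPRs
 *    demand += tmp;                           // -> {9, 16}
 *    demand.update(RegisterDemand(4, 32));    // per-component max -> {9, 32}
 *    bool too_high = demand.exceeds(RegisterDemand(24, 32)); // false
 */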

/* CFG */
struct Block {
   float_mode fp_mode;
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   uint16_t kind = 0;
   int logical_idom = -1;
   int linear_idom = -1;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;

   Block() : index(0) {}
};

/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 */
enum class SWStage : uint16_t {
   None = 0,
   VS = 1 << 0,     /* Vertex Shader */
   GS = 1 << 1,     /* Geometry Shader */
   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,     /* Fragment aka Pixel Shader */
   CS = 1 << 5,     /* Compute Shader */
   TS = 1 << 6,     /* Task Shader */
   MS = 1 << 7,     /* Mesh Shader */
   GSCopy = 1 << 8, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};

constexpr SWStage
operator|(SWStage a, SWStage b)
{
   return static_cast<SWStage>(static_cast<uint16_t>(a) | static_cast<uint16_t>(b));
}
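
/* Example: SWStage values are bit masks, so merged stages compose with the
 * operator| above, e.g.
 *
 *    (SWStage::VS | SWStage::GS) == SWStage::VS_GS
 */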

/*
 * Shader stages as running on the AMD GPU.
 *
 * The relation between HWStages and SWStages is not a one-to-one mapping:
 * Some SWStages are merged by ACO to run on a single HWStage.
 * See README.md for details.
 */
enum class HWStage : uint8_t {
   VS,
   ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
   GS, /* Geometry shader on GFX10/legacy and GFX6-9. */
   NGG, /* Primitive shader, used to implement VS, TES, GS. */
   LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
   HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
   FS,
   CS,
};

/*
 * Set of SWStages to be merged into a single shader paired with the
 * HWStage it will run on.
 */
struct Stage {
   constexpr Stage() = default;

   explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}

   /* Check if the given SWStage is included */
   constexpr bool has(SWStage stage) const
   {
      return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
   }

   unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }

   constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }

   constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }

   /* Mask of merged software stages */
   SWStage sw = SWStage::None;

   /* Active hardware stage */
   HWStage hw{};
};

/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* Mesh shading pipeline */
static constexpr Stage task_cs(HWStage::CS, SWStage::TS);
static constexpr Stage mesh_ngg(HWStage::NGG, SWStage::MS);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
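
/* Example (illustrative): querying a merged stage. vertex_geometry_gs runs
 * VS and GS merged on the hardware GS stage:
 *
 *    Stage s = vertex_geometry_gs;
 *    assert(s.has(SWStage::VS) && s.has(SWStage::GS));
 *    assert(s.num_sw_stages() == 2 && s.hw == HWStage::GS);
 */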

enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_latency,
   statistic_inv_throughput,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};

struct DeviceInfo {
   uint16_t lds_encoding_granule;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t physical_sgprs;
   uint16_t physical_vgprs;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t sgpr_alloc_granule;
   uint16_t vgpr_alloc_granule; /* must be power of two */
   unsigned max_wave64_per_simd;
   unsigned simd_per_cu;
   bool has_fast_fma32 = false;
   bool has_mac_legacy32 = false;
   bool fused_mad_mix = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;

   int16_t scratch_global_offset_min;
   int16_t scratch_global_offset_max;
};

enum class CompilationProgress {
   after_isel,
   after_spilling,
   after_ra,
};

class Program final {
public:
   std::vector<Block> blocks;
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   ac_shader_config* config;
   struct aco_shader_info info;
   enum amd_gfx_level gfx_level;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t num_waves = 0;
   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> vs_inputs;

   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum aco_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};
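
/* Example (illustrative sketch): typical use of the allocation helpers when
 * building IR. `program` is assumed to be an initialized Program; v1 denotes
 * the one-dword VGPR RegClass, as s1 does for SGPRs above:
 *
 *    Block* block = program->create_and_insert_block();
 *    Temp tmp = program->allocateTmp(v1);   // fresh SSA id of class v1
 *    block->instructions.emplace_back(...); // append aco_ptr<Instruction>s
 */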

struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};

struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};

void init();

void init_program(Program* program, Stage stage, const struct aco_shader_info* info,
                  enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct aco_compiler_options* options,
                    const struct aco_shader_info* info,
                    const struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader,
                           ac_shader_config* config,
                           const struct aco_compiler_options* options,
                           const struct aco_shader_info* info,
                           const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config,
                                const struct aco_compiler_options* options,
                                const struct aco_shader_info* info,
                                const struct radv_shader_args* args);
void select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key,
                      ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info,
                      const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);

void select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key,
                      ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info,
                      const struct radv_shader_args* args);

void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
bool validate_ir(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif

void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);

enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};

void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);
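
/* Example: dumping the IR while debugging, without and with live-variable
 * info (live_vars as computed by live_var_analysis()):
 *
 *    aco_print_program(program, stderr);
 *    aco_print_program(program, stderr, live_vars, print_live_vars | print_kill);
 */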

void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* adjust num_waves for workgroup size and LDS limits */
uint16_t max_suitable_waves(Program* program, uint16_t waves);

/* get the number of sgprs/vgprs that must be allocated to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return the number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);

typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;

} // namespace aco

#endif /* ACO_IR_H */