1/* 2 * Copyright © 2018 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
22 * 23 */ 24 25#include "aco_ir.h" 26 27#include <algorithm> 28#include <array> 29#include <bitset> 30#include <map> 31#include <set> 32#include <unordered_map> 33#include <vector> 34 35namespace aco { 36namespace { 37 38struct ra_ctx; 39 40unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, 41 unsigned idx, RegClass rc); 42void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, 43 RegClass rc); 44std::pair<unsigned, unsigned> 45get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc); 46void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg); 47 48struct assignment { 49 PhysReg reg; 50 RegClass rc; 51 union { 52 struct { 53 bool assigned : 1; 54 bool vcc : 1; 55 }; 56 uint8_t _ = 0; 57 }; 58 uint32_t affinity = 0; 59 assignment() = default; 60 assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {} 61 void set(const Definition& def) 62 { 63 assigned = true; 64 reg = def.physReg(); 65 rc = def.regClass(); 66 } 67}; 68 69struct ra_ctx { 70 71 Program* program; 72 Block* block = NULL; 73 std::vector<assignment> assignments; 74 std::vector<std::unordered_map<unsigned, Temp>> renames; 75 std::vector<uint32_t> loop_header; 76 std::unordered_map<unsigned, Temp> orig_names; 77 std::unordered_map<unsigned, Instruction*> vectors; 78 std::unordered_map<unsigned, Instruction*> split_vectors; 79 aco_ptr<Instruction> pseudo_dummy; 80 aco_ptr<Instruction> phi_dummy; 81 uint16_t max_used_sgpr = 0; 82 uint16_t max_used_vgpr = 0; 83 uint16_t sgpr_limit; 84 uint16_t vgpr_limit; 85 std::bitset<512> war_hint; 86 std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */ 87 88 ra_test_policy policy; 89 90 ra_ctx(Program* program_, ra_test_policy policy_) 91 : program(program_), assignments(program->peekAllocationId()), 92 renames(program->blocks.size()), policy(policy_) 93 { 94 
pseudo_dummy.reset( 95 create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); 96 phi_dummy.reset( 97 create_instruction<Instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0)); 98 sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); 99 vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); 100 } 101}; 102 103/* Iterator type for making PhysRegInterval compatible with range-based for */ 104struct PhysRegIterator { 105 using difference_type = int; 106 using value_type = unsigned; 107 using reference = const unsigned&; 108 using pointer = const unsigned*; 109 using iterator_category = std::bidirectional_iterator_tag; 110 111 PhysReg reg; 112 113 PhysReg operator*() const { return reg; } 114 115 PhysRegIterator& operator++() 116 { 117 reg.reg_b += 4; 118 return *this; 119 } 120 121 PhysRegIterator& operator--() 122 { 123 reg.reg_b -= 4; 124 return *this; 125 } 126 127 bool operator==(PhysRegIterator oth) const { return reg == oth.reg; } 128 129 bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; } 130 131 bool operator<(PhysRegIterator oth) const { return reg < oth.reg; } 132}; 133 134/* Half-open register interval used in "sliding window"-style for-loops */ 135struct PhysRegInterval { 136 PhysReg lo_; 137 unsigned size; 138 139 /* Inclusive lower bound */ 140 PhysReg lo() const { return lo_; } 141 142 /* Exclusive upper bound */ 143 PhysReg hi() const { return PhysReg{lo() + size}; } 144 145 PhysRegInterval& operator+=(uint32_t stride) 146 { 147 lo_ = PhysReg{lo_.reg() + stride}; 148 return *this; 149 } 150 151 bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; } 152 153 /* Construct a half-open interval, excluding the end register */ 154 static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; } 155 156 bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); } 157 158 bool contains(const 
PhysRegInterval& needle) const 159 { 160 return needle.lo() >= lo() && needle.hi() <= hi(); 161 } 162 163 PhysRegIterator begin() const { return {lo_}; } 164 165 PhysRegIterator end() const { return {PhysReg{lo_ + size}}; } 166}; 167 168bool 169intersects(const PhysRegInterval& a, const PhysRegInterval& b) 170{ 171 return a.hi() > b.lo() && b.hi() > a.lo(); 172} 173 174/* Gets the stride for full (non-subdword) registers */ 175uint32_t 176get_stride(RegClass rc) 177{ 178 if (rc.type() == RegType::vgpr) { 179 return 1; 180 } else { 181 uint32_t size = rc.size(); 182 if (size == 2) { 183 return 2; 184 } else if (size >= 4) { 185 return 4; 186 } else { 187 return 1; 188 } 189 } 190} 191 192PhysRegInterval 193get_reg_bounds(Program* program, RegType type) 194{ 195 if (type == RegType::vgpr) { 196 return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr}; 197 } else { 198 return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr}; 199 } 200} 201 202struct DefInfo { 203 PhysRegInterval bounds; 204 uint8_t size; 205 uint8_t stride; 206 RegClass rc; 207 208 DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_) 209 { 210 size = rc.size(); 211 stride = get_stride(rc); 212 213 bounds = get_reg_bounds(ctx.program, rc.type()); 214 215 if (rc.is_subdword() && operand >= 0) { 216 /* stride in bytes */ 217 stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc); 218 } else if (rc.is_subdword()) { 219 std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc); 220 stride = info.first; 221 if (info.second > rc.bytes()) { 222 rc = RegClass::get(rc.type(), info.second); 223 size = rc.size(); 224 /* we might still be able to put the definition in the high half, 225 * but that's only useful for affinities and this information isn't 226 * used for them */ 227 stride = align(stride, info.second); 228 if (!rc.is_subdword()) 229 stride = DIV_ROUND_UP(stride, 4); 230 } 231 assert(stride > 0); 232 } 
else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) { 233 /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug 234 * 235 * The register use is not calculated correctly, and the hardware assumes a 236 * full dword per component. Don't use the last registers of the register file. 237 * Otherwise, the instruction will be skipped. 238 * 239 * https://reviews.llvm.org/D81172 240 */ 241 bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF; 242 assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported."); 243 244 if (imageGather4D16Bug) 245 bounds.size -= rc.bytes() / 4; 246 } 247 } 248}; 249 250class RegisterFile { 251public: 252 RegisterFile() { regs.fill(0); } 253 254 std::array<uint32_t, 512> regs; 255 std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs; 256 257 const uint32_t& operator[](PhysReg index) const { return regs[index]; } 258 259 uint32_t& operator[](PhysReg index) { return regs[index]; } 260 261 unsigned count_zero(PhysRegInterval reg_interval) 262 { 263 unsigned res = 0; 264 for (PhysReg reg : reg_interval) 265 res += !regs[reg]; 266 return res; 267 } 268 269 /* Returns true if any of the bytes in the given range are allocated or blocked */ 270 bool test(PhysReg start, unsigned num_bytes) 271 { 272 for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) { 273 assert(i <= 511); 274 if (regs[i] & 0x0FFFFFFF) 275 return true; 276 if (regs[i] == 0xF0000000) { 277 assert(subdword_regs.find(i) != subdword_regs.end()); 278 for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) { 279 if (subdword_regs[i][j]) 280 return true; 281 } 282 } 283 } 284 return false; 285 } 286 287 void block(PhysReg start, RegClass rc) 288 { 289 if (rc.is_subdword()) 290 fill_subdword(start, rc.bytes(), 0xFFFFFFFF); 291 else 292 fill(start, rc.size(), 0xFFFFFFFF); 293 } 294 295 bool is_blocked(PhysReg start) 296 { 297 if 
(regs[start] == 0xFFFFFFFF) 298 return true; 299 if (regs[start] == 0xF0000000) { 300 for (unsigned i = start.byte(); i < 4; i++) 301 if (subdword_regs[start][i] == 0xFFFFFFFF) 302 return true; 303 } 304 return false; 305 } 306 307 bool is_empty_or_blocked(PhysReg start) 308 { 309 /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the 310 * incremented value to 1 */ 311 if (regs[start] == 0xF0000000) { 312 return subdword_regs[start][start.byte()] + 1 <= 1; 313 } 314 return regs[start] + 1 <= 1; 315 } 316 317 void clear(PhysReg start, RegClass rc) 318 { 319 if (rc.is_subdword()) 320 fill_subdword(start, rc.bytes(), 0); 321 else 322 fill(start, rc.size(), 0); 323 } 324 325 void fill(Operand op) 326 { 327 if (op.regClass().is_subdword()) 328 fill_subdword(op.physReg(), op.bytes(), op.tempId()); 329 else 330 fill(op.physReg(), op.size(), op.tempId()); 331 } 332 333 void clear(Operand op) { clear(op.physReg(), op.regClass()); } 334 335 void fill(Definition def) 336 { 337 if (def.regClass().is_subdword()) 338 fill_subdword(def.physReg(), def.bytes(), def.tempId()); 339 else 340 fill(def.physReg(), def.size(), def.tempId()); 341 } 342 343 void clear(Definition def) { clear(def.physReg(), def.regClass()); } 344 345 unsigned get_id(PhysReg reg) 346 { 347 return regs[reg] == 0xF0000000 ? 
subdword_regs[reg][reg.byte()] : regs[reg]; 348 } 349 350private: 351 void fill(PhysReg start, unsigned size, uint32_t val) 352 { 353 for (unsigned i = 0; i < size; i++) 354 regs[start + i] = val; 355 } 356 357 void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) 358 { 359 fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000); 360 for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) { 361 /* emplace or get */ 362 std::array<uint32_t, 4>& sub = 363 subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second; 364 for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) 365 sub[j] = val; 366 367 if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) { 368 subdword_regs.erase(i); 369 regs[i] = 0; 370 } 371 } 372 } 373}; 374 375std::vector<unsigned> find_vars(ra_ctx& ctx, RegisterFile& reg_file, 376 const PhysRegInterval reg_interval); 377 378/* helper function for debugging */ 379UNUSED void 380print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable) 381{ 382 if (reg_file[reg] == 0xFFFFFFFF) { 383 printf("☐"); 384 } else if (reg_file[reg]) { 385 const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000); 386 if (show_subdword_alloc) { 387 const char* block_chars[] = { 388 // clang-format off 389 "?", "▘", "▝", "▀", 390 "▖", "▌", "▞", "▛", 391 "▗", "▚", "▐", "▜", 392 "▄", "▙", "▟", "▉" 393 // clang-format on 394 }; 395 unsigned index = 0; 396 for (int i = 0; i < 4; ++i) { 397 if (reg_file.subdword_regs.at(reg)[i]) { 398 index |= 1 << i; 399 } 400 } 401 printf("%s", block_chars[index]); 402 } else { 403 /* Indicate filled register slot */ 404 if (!has_adjacent_variable) { 405 printf("█"); 406 } else { 407 /* Use a slightly shorter box to leave a small gap between adjacent variables */ 408 printf("▉"); 409 } 410 } 411 } else { 412 printf("·"); 413 } 414} 415 416/* helper function for debugging */ 417UNUSED void 418print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) 
419{ 420 PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr); 421 char reg_char = vgprs ? 'v' : 's'; 422 const int max_regs_per_line = 64; 423 424 /* print markers */ 425 printf(" "); 426 for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) { 427 printf("%-3.2u ", i); 428 } 429 printf("\n"); 430 431 /* print usage */ 432 auto line_begin_it = regs.begin(); 433 while (line_begin_it != regs.end()) { 434 const int regs_in_line = 435 std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end())); 436 437 if (line_begin_it == regs.begin()) { 438 printf("%cgprs: ", reg_char); 439 } else { 440 printf(" %+4d ", std::distance(regs.begin(), line_begin_it)); 441 } 442 const auto line_end_it = std::next(line_begin_it, regs_in_line); 443 444 for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) { 445 bool has_adjacent_variable = 446 (std::next(reg_it) != line_end_it && 447 reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]); 448 print_reg(reg_file, *reg_it, has_adjacent_variable); 449 } 450 451 line_begin_it = line_end_it; 452 printf("\n"); 453 } 454 455 const unsigned free_regs = 456 std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; }); 457 printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size); 458 459 /* print assignments ordered by registers */ 460 std::map<PhysReg, std::pair<unsigned, unsigned>> 461 regs_to_vars; /* maps to byte size and temp id */ 462 for (unsigned id : find_vars(ctx, reg_file, regs)) { 463 const assignment& var = ctx.assignments[id]; 464 PhysReg reg = var.reg; 465 ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id)); 466 assert(inserted.second); 467 } 468 469 for (const auto& reg_and_var : regs_to_vars) { 470 const auto& first_reg = reg_and_var.first; 471 const auto& size_id = reg_and_var.second; 472 473 printf("%%%u ", 
size_id.second); 474 if (ctx.orig_names.count(size_id.second) && 475 ctx.orig_names[size_id.second].id() != size_id.second) { 476 printf("(was %%%d) ", ctx.orig_names[size_id.second].id()); 477 } 478 printf("= %c[%d", reg_char, first_reg.reg() - regs.lo()); 479 PhysReg last_reg = first_reg.advance(size_id.first - 1); 480 if (first_reg.reg() != last_reg.reg()) { 481 assert(first_reg.byte() == 0 && last_reg.byte() == 3); 482 printf("-%d", last_reg.reg() - regs.lo()); 483 } 484 printf("]"); 485 if (first_reg.byte() != 0 || last_reg.byte() != 3) { 486 printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8); 487 } 488 printf("\n"); 489 } 490} 491 492unsigned 493get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, 494 unsigned idx, RegClass rc) 495{ 496 if (instr->isPseudo()) { 497 /* v_readfirstlane_b32 cannot use SDWA */ 498 if (instr->opcode == aco_opcode::p_as_uniform) 499 return 4; 500 else if (gfx_level >= GFX8) 501 return rc.bytes() % 2 == 0 ? 2 : 1; 502 else 503 return 4; 504 } 505 506 assert(rc.bytes() <= 2); 507 if (instr->isVALU()) { 508 if (can_use_SDWA(gfx_level, instr, false)) 509 return rc.bytes(); 510 if (can_use_opsel(gfx_level, instr->opcode, idx)) 511 return 2; 512 if (instr->format == Format::VOP3P) 513 return 2; 514 } 515 516 switch (instr->opcode) { 517 case aco_opcode::v_cvt_f32_ubyte0: return 1; 518 case aco_opcode::ds_write_b8: 519 case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4; 520 case aco_opcode::buffer_store_byte: 521 case aco_opcode::buffer_store_short: 522 case aco_opcode::buffer_store_format_d16_x: 523 case aco_opcode::flat_store_byte: 524 case aco_opcode::flat_store_short: 525 case aco_opcode::scratch_store_byte: 526 case aco_opcode::scratch_store_short: 527 case aco_opcode::global_store_byte: 528 case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 
2 : 4; 529 default: return 4; 530 } 531} 532 533void 534add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, 535 RegClass rc) 536{ 537 amd_gfx_level gfx_level = ctx.program->gfx_level; 538 if (instr->isPseudo() || byte == 0) 539 return; 540 541 assert(rc.bytes() <= 2); 542 if (instr->isVALU()) { 543 /* check if we can use opsel */ 544 if (instr->format == Format::VOP3) { 545 assert(byte == 2); 546 instr->vop3().opsel |= 1 << idx; 547 return; 548 } 549 if (instr->isVOP3P()) { 550 assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx))); 551 instr->vop3p().opsel_lo |= 1 << idx; 552 instr->vop3p().opsel_hi |= 1 << idx; 553 return; 554 } 555 if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) { 556 switch (byte) { 557 case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; 558 case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break; 559 case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; 560 case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; 561 } 562 return; 563 } 564 565 /* use SDWA */ 566 assert(can_use_SDWA(gfx_level, instr, false)); 567 convert_to_SDWA(gfx_level, instr); 568 return; 569 } 570 571 assert(byte == 2); 572 if (instr->opcode == aco_opcode::ds_write_b8) 573 instr->opcode = aco_opcode::ds_write_b8_d16_hi; 574 else if (instr->opcode == aco_opcode::ds_write_b16) 575 instr->opcode = aco_opcode::ds_write_b16_d16_hi; 576 else if (instr->opcode == aco_opcode::buffer_store_byte) 577 instr->opcode = aco_opcode::buffer_store_byte_d16_hi; 578 else if (instr->opcode == aco_opcode::buffer_store_short) 579 instr->opcode = aco_opcode::buffer_store_short_d16_hi; 580 else if (instr->opcode == aco_opcode::buffer_store_format_d16_x) 581 instr->opcode = aco_opcode::buffer_store_format_d16_hi_x; 582 else if (instr->opcode == aco_opcode::flat_store_byte) 583 instr->opcode = aco_opcode::flat_store_byte_d16_hi; 584 else if (instr->opcode == aco_opcode::flat_store_short) 585 instr->opcode = 
aco_opcode::flat_store_short_d16_hi; 586 else if (instr->opcode == aco_opcode::scratch_store_byte) 587 instr->opcode = aco_opcode::scratch_store_byte_d16_hi; 588 else if (instr->opcode == aco_opcode::scratch_store_short) 589 instr->opcode = aco_opcode::scratch_store_short_d16_hi; 590 else if (instr->opcode == aco_opcode::global_store_byte) 591 instr->opcode = aco_opcode::global_store_byte_d16_hi; 592 else if (instr->opcode == aco_opcode::global_store_short) 593 instr->opcode = aco_opcode::global_store_short_d16_hi; 594 else 595 unreachable("Something went wrong: Impossible register assignment."); 596 return; 597} 598 599/* minimum_stride, bytes_written */ 600std::pair<unsigned, unsigned> 601get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc) 602{ 603 amd_gfx_level gfx_level = program->gfx_level; 604 605 if (instr->isPseudo()) { 606 if (gfx_level >= GFX8) 607 return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes()); 608 else 609 return std::make_pair(4, rc.size() * 4u); 610 } 611 612 if (instr->isVALU() || instr->isVINTRP()) { 613 assert(rc.bytes() <= 2); 614 615 if (can_use_SDWA(gfx_level, instr, false)) 616 return std::make_pair(rc.bytes(), rc.bytes()); 617 618 unsigned bytes_written = 4u; 619 if (instr_is_16bit(gfx_level, instr->opcode)) 620 bytes_written = 2u; 621 622 unsigned stride = 4u; 623 if (instr->opcode == aco_opcode::v_fma_mixlo_f16 || 624 can_use_opsel(gfx_level, instr->opcode, -1)) 625 stride = 2u; 626 627 return std::make_pair(stride, bytes_written); 628 } 629 630 switch (instr->opcode) { 631 /* D16 loads with _hi version */ 632 case aco_opcode::ds_read_u8_d16: 633 case aco_opcode::ds_read_i8_d16: 634 case aco_opcode::ds_read_u16_d16: 635 case aco_opcode::flat_load_ubyte_d16: 636 case aco_opcode::flat_load_sbyte_d16: 637 case aco_opcode::flat_load_short_d16: 638 case aco_opcode::global_load_ubyte_d16: 639 case aco_opcode::global_load_sbyte_d16: 640 case aco_opcode::global_load_short_d16: 641 case 
aco_opcode::scratch_load_ubyte_d16: 642 case aco_opcode::scratch_load_sbyte_d16: 643 case aco_opcode::scratch_load_short_d16: 644 case aco_opcode::buffer_load_ubyte_d16: 645 case aco_opcode::buffer_load_sbyte_d16: 646 case aco_opcode::buffer_load_short_d16: 647 case aco_opcode::buffer_load_format_d16_x: { 648 assert(gfx_level >= GFX9); 649 if (!program->dev.sram_ecc_enabled) 650 return std::make_pair(2u, 2u); 651 else 652 return std::make_pair(2u, 4u); 653 } 654 /* 3-component D16 loads */ 655 case aco_opcode::buffer_load_format_d16_xyz: 656 case aco_opcode::tbuffer_load_format_d16_xyz: { 657 assert(gfx_level >= GFX9); 658 if (!program->dev.sram_ecc_enabled) 659 return std::make_pair(4u, 6u); 660 break; 661 } 662 663 default: break; 664 } 665 666 if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) { 667 assert(gfx_level >= GFX9); 668 return std::make_pair(4u, rc.bytes()); 669 } 670 671 return std::make_pair(4, rc.size() * 4u); 672} 673 674void 675add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg) 676{ 677 if (instr->isPseudo()) 678 return; 679 680 if (instr->isVALU()) { 681 amd_gfx_level gfx_level = program->gfx_level; 682 assert(instr->definitions[0].bytes() <= 2); 683 684 if (reg.byte() == 0 && instr_is_16bit(gfx_level, instr->opcode)) 685 return; 686 687 /* check if we can use opsel */ 688 if (instr->format == Format::VOP3) { 689 assert(reg.byte() == 2); 690 assert(can_use_opsel(gfx_level, instr->opcode, -1)); 691 instr->vop3().opsel |= (1 << 3); /* dst in high half */ 692 return; 693 } 694 695 if (instr->opcode == aco_opcode::v_fma_mixlo_f16) { 696 instr->opcode = aco_opcode::v_fma_mixhi_f16; 697 return; 698 } 699 700 /* use SDWA */ 701 assert(can_use_SDWA(gfx_level, instr, false)); 702 convert_to_SDWA(gfx_level, instr); 703 return; 704 } 705 706 if (reg.byte() == 0) 707 return; 708 else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16) 709 instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi; 710 
else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16) 711 instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi; 712 else if (instr->opcode == aco_opcode::buffer_load_short_d16) 713 instr->opcode = aco_opcode::buffer_load_short_d16_hi; 714 else if (instr->opcode == aco_opcode::buffer_load_format_d16_x) 715 instr->opcode = aco_opcode::buffer_load_format_d16_hi_x; 716 else if (instr->opcode == aco_opcode::flat_load_ubyte_d16) 717 instr->opcode = aco_opcode::flat_load_ubyte_d16_hi; 718 else if (instr->opcode == aco_opcode::flat_load_sbyte_d16) 719 instr->opcode = aco_opcode::flat_load_sbyte_d16_hi; 720 else if (instr->opcode == aco_opcode::flat_load_short_d16) 721 instr->opcode = aco_opcode::flat_load_short_d16_hi; 722 else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16) 723 instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi; 724 else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16) 725 instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi; 726 else if (instr->opcode == aco_opcode::scratch_load_short_d16) 727 instr->opcode = aco_opcode::scratch_load_short_d16_hi; 728 else if (instr->opcode == aco_opcode::global_load_ubyte_d16) 729 instr->opcode = aco_opcode::global_load_ubyte_d16_hi; 730 else if (instr->opcode == aco_opcode::global_load_sbyte_d16) 731 instr->opcode = aco_opcode::global_load_sbyte_d16_hi; 732 else if (instr->opcode == aco_opcode::global_load_short_d16) 733 instr->opcode = aco_opcode::global_load_short_d16_hi; 734 else if (instr->opcode == aco_opcode::ds_read_u8_d16) 735 instr->opcode = aco_opcode::ds_read_u8_d16_hi; 736 else if (instr->opcode == aco_opcode::ds_read_i8_d16) 737 instr->opcode = aco_opcode::ds_read_i8_d16_hi; 738 else if (instr->opcode == aco_opcode::ds_read_u16_d16) 739 instr->opcode = aco_opcode::ds_read_u16_d16_hi; 740 else 741 unreachable("Something went wrong: Impossible register assignment."); 742} 743 744void 745adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) 746{ 747 uint16_t 
max_addressible_sgpr = ctx.sgpr_limit; 748 unsigned size = rc.size(); 749 if (rc.type() == RegType::vgpr) { 750 assert(reg >= 256); 751 uint16_t hi = reg - 256 + size - 1; 752 ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi); 753 } else if (reg + rc.size() <= max_addressible_sgpr) { 754 uint16_t hi = reg + size - 1; 755 ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr)); 756 } 757} 758 759enum UpdateRenames { 760 rename_not_killed_ops = 0x1, 761 fill_killed_ops = 0x2, 762}; 763MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames); 764 765void 766update_renames(ra_ctx& ctx, RegisterFile& reg_file, 767 std::vector<std::pair<Operand, Definition>>& parallelcopies, 768 aco_ptr<Instruction>& instr, UpdateRenames flags) 769{ 770 /* clear operands */ 771 for (std::pair<Operand, Definition>& copy : parallelcopies) { 772 /* the definitions with id are not from this function and already handled */ 773 if (copy.second.isTemp()) 774 continue; 775 reg_file.clear(copy.first); 776 } 777 778 /* allocate id's and rename operands: this is done transparently here */ 779 auto it = parallelcopies.begin(); 780 while (it != parallelcopies.end()) { 781 if (it->second.isTemp()) { 782 ++it; 783 continue; 784 } 785 786 /* check if we moved a definition: change the register and remove copy */ 787 bool is_def = false; 788 for (Definition& def : instr->definitions) { 789 if (def.isTemp() && def.getTemp() == it->first.getTemp()) { 790 // FIXME: ensure that the definition can use this reg 791 def.setFixed(it->second.physReg()); 792 reg_file.fill(def); 793 ctx.assignments[def.tempId()].reg = def.physReg(); 794 it = parallelcopies.erase(it); 795 is_def = true; 796 break; 797 } 798 } 799 if (is_def) 800 continue; 801 802 /* check if we moved another parallelcopy definition */ 803 for (std::pair<Operand, Definition>& other : parallelcopies) { 804 if (!other.second.isTemp()) 805 continue; 806 if (it->first.getTemp() == other.second.getTemp()) { 807 
other.second.setFixed(it->second.physReg()); 808 ctx.assignments[other.second.tempId()].reg = other.second.physReg(); 809 it = parallelcopies.erase(it); 810 is_def = true; 811 /* check if we moved an operand, again */ 812 bool fill = true; 813 for (Operand& op : instr->operands) { 814 if (op.isTemp() && op.tempId() == other.second.tempId()) { 815 // FIXME: ensure that the operand can use this reg 816 op.setFixed(other.second.physReg()); 817 fill = (flags & fill_killed_ops) || !op.isKillBeforeDef(); 818 } 819 } 820 if (fill) 821 reg_file.fill(other.second); 822 break; 823 } 824 } 825 if (is_def) 826 continue; 827 828 std::pair<Operand, Definition>& copy = *it; 829 copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass())); 830 ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass()); 831 assert(ctx.assignments.size() == ctx.program->peekAllocationId()); 832 833 /* check if we moved an operand */ 834 bool first = true; 835 bool fill = true; 836 for (unsigned i = 0; i < instr->operands.size(); i++) { 837 Operand& op = instr->operands[i]; 838 if (!op.isTemp()) 839 continue; 840 if (op.tempId() == copy.first.tempId()) { 841 bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef(); 842 for (std::pair<Operand, Definition>& pc : parallelcopies) { 843 PhysReg def_reg = pc.second.physReg(); 844 omit_renaming &= def_reg > copy.first.physReg() 845 ? 
(copy.first.physReg() + copy.first.size() <= def_reg.reg()) 846 : (def_reg + pc.second.size() <= copy.first.physReg().reg()); 847 } 848 if (omit_renaming) { 849 if (first) 850 op.setFirstKill(true); 851 else 852 op.setKill(true); 853 first = false; 854 continue; 855 } 856 op.setTemp(copy.second.getTemp()); 857 op.setFixed(copy.second.physReg()); 858 859 fill = (flags & fill_killed_ops) || !op.isKillBeforeDef(); 860 } 861 } 862 863 if (fill) 864 reg_file.fill(copy.second); 865 866 ++it; 867 } 868} 869 870std::pair<PhysReg, bool> 871get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) 872{ 873 const PhysRegInterval& bounds = info.bounds; 874 uint32_t size = info.size; 875 uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride; 876 RegClass rc = info.rc; 877 878 DefInfo new_info = info; 879 new_info.rc = RegClass(rc.type(), size); 880 for (unsigned new_stride = 16; new_stride > stride; new_stride /= 2) { 881 if (size % new_stride) 882 continue; 883 new_info.stride = new_stride; 884 std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, new_info); 885 if (res.second) 886 return res; 887 } 888 889 auto is_free = [&](PhysReg reg_index) 890 { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; }; 891 892 if (stride == 1) { 893 /* best fit algorithm: find the smallest gap to fit in the variable */ 894 PhysRegInterval best_gap{PhysReg{0}, UINT_MAX}; 895 const unsigned max_gpr = 896 (rc.type() == RegType::vgpr) ? 
(256 + ctx.max_used_vgpr) : ctx.max_used_sgpr; 897 898 PhysRegIterator reg_it = bounds.begin(); 899 const PhysRegIterator end_it = 900 std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it)); 901 while (reg_it != bounds.end()) { 902 /* Find the next chunk of available register slots */ 903 reg_it = std::find_if(reg_it, end_it, is_free); 904 auto next_nonfree_it = std::find_if_not(reg_it, end_it, is_free); 905 if (reg_it == bounds.end()) { 906 break; 907 } 908 909 if (next_nonfree_it == end_it) { 910 /* All registers past max_used_gpr are free */ 911 next_nonfree_it = bounds.end(); 912 } 913 914 PhysRegInterval gap = PhysRegInterval::from_until(*reg_it, *next_nonfree_it); 915 916 /* early return on exact matches */ 917 if (size == gap.size) { 918 adjust_max_used_regs(ctx, rc, gap.lo()); 919 return {gap.lo(), true}; 920 } 921 922 /* check if it fits and the gap size is smaller */ 923 if (size < gap.size && gap.size < best_gap.size) { 924 best_gap = gap; 925 } 926 927 /* Move past the processed chunk */ 928 reg_it = next_nonfree_it; 929 } 930 931 if (best_gap.size == UINT_MAX) 932 return {{}, false}; 933 934 /* find best position within gap by leaving a good stride for other variables*/ 935 unsigned buffer = best_gap.size - size; 936 if (buffer > 1) { 937 if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) || 938 ((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) || 939 ((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0)) 940 best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer}; 941 } 942 943 adjust_max_used_regs(ctx, rc, best_gap.lo()); 944 return {best_gap.lo(), true}; 945 } 946 947 for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi(); 948 reg_win += stride) { 949 if (reg_file[reg_win.lo()] != 0) { 950 continue; 951 } 952 953 bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free); 954 if (is_valid) { 955 
adjust_max_used_regs(ctx, rc, reg_win.lo()); 956 return {reg_win.lo(), true}; 957 } 958 } 959 960 /* do this late because using the upper bytes of a register can require 961 * larger instruction encodings or copies 962 * TODO: don't do this in situations where it doesn't benefit */ 963 if (rc.is_subdword()) { 964 for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) { 965 assert(reg_file[PhysReg{entry.first}] == 0xF0000000); 966 if (!bounds.contains({PhysReg{entry.first}, rc.size()})) 967 continue; 968 969 for (unsigned i = 0; i < 4; i += info.stride) { 970 /* check if there's a block of free bytes large enough to hold the register */ 971 bool reg_found = 972 std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())], 973 [](unsigned v) { return v == 0; }); 974 975 /* check if also the neighboring reg is free if needed */ 976 if (reg_found && i + rc.bytes() > 4) 977 reg_found = (reg_file[PhysReg{entry.first + 1}] == 0); 978 979 if (reg_found) { 980 PhysReg res{entry.first}; 981 res.reg_b += i; 982 adjust_max_used_regs(ctx, rc, entry.first); 983 return {res, true}; 984 } 985 } 986 } 987 } 988 989 return {{}, false}; 990} 991 992/* collect variables from a register area */ 993std::vector<unsigned> 994find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) 995{ 996 std::vector<unsigned> vars; 997 for (PhysReg j : reg_interval) { 998 if (reg_file.is_blocked(j)) 999 continue; 1000 if (reg_file[j] == 0xF0000000) { 1001 for (unsigned k = 0; k < 4; k++) { 1002 unsigned id = reg_file.subdword_regs[j][k]; 1003 if (id && (vars.empty() || id != vars.back())) 1004 vars.emplace_back(id); 1005 } 1006 } else { 1007 unsigned id = reg_file[j]; 1008 if (id && (vars.empty() || id != vars.back())) 1009 vars.emplace_back(id); 1010 } 1011 } 1012 return vars; 1013} 1014 1015/* collect variables from a register area and clear reg_file 1016 * variables are sorted in decreasing size and 1017 * increasing assigned 
register
 */
std::vector<unsigned>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
   /* sort: larger variables first, ties broken by lower assigned register */
   std::sort(ids.begin(), ids.end(),
             [&](unsigned a, unsigned b)
             {
                assignment& var_a = ctx.assignments[a];
                assignment& var_b = ctx.assignments[b];
                return var_a.rc.bytes() > var_b.rc.bytes() ||
                       (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg);
             });

   /* free the collected variables' current positions in the register file */
   for (unsigned id : ids) {
      assignment& var = ctx.assignments[id];
      reg_file.clear(var.reg, var.rc);
   }
   return ids;
}

/* For p_create_vector: find a position for the displaced variable `id`, either
 * by reusing a killed operand's slot inside the definition window `def_reg` or
 * by swapping with a killed operand whose register lies outside the window.
 * Returns <register, success>. */
std::pair<PhysReg, bool>
get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
                               std::vector<std::pair<Operand, Definition>>& parallelcopies,
                               aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
                               DefInfo info, unsigned id)
{
   PhysReg reg = def_reg.lo();
   /* dead operand: return position in vector */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
          instr->operands[i].isKillBeforeDef()) {
         assert(!reg_file.test(reg, instr->operands[i].bytes()));
         return {reg, info.rc.is_subdword() || reg.byte() == 0};
      }
      reg.reg_b += instr->operands[i].bytes();
   }

   if (ctx.program->gfx_level <= GFX8)
      return {PhysReg(), false};

   /* check if the previous position was in vector */
   assignment& var = ctx.assignments[id];
   if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
      reg = def_reg.lo();
      /* try to use the previous register of the operand */
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         if (reg != var.reg) {
            reg.reg_b += instr->operands[i].bytes();
            continue;
         }

         /* check if we can swap positions */
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
             instr->operands[i].regClass() == info.rc) {
            assignment& op = ctx.assignments[instr->operands[i].tempId()];
            /* if everything matches, create parallelcopy for the killed operand */
            if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) &&
                reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
               Definition pc_def = Definition(reg, info.rc);
               parallelcopies.emplace_back(instr->operands[i], pc_def);
               return {op.reg, true};
            }
         }
         return {PhysReg(), false};
      }
   }
   return {PhysReg(), false};
}

/* Find new positions for every variable in `vars` (and record the required
 * parallelcopies) so that the window `def_reg` becomes usable. Recurses when a
 * chosen position itself contains variables that must move. Returns false if
 * no placement can be found. */
bool
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    const std::vector<unsigned>& vars, const PhysRegInterval bounds,
                    aco_ptr<Instruction>& instr, const PhysRegInterval def_reg)
{
   /* Variables are sorted from large to small and with increasing assigned register */
   for (unsigned id : vars) {
      assignment& var = ctx.assignments[id];
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
      uint32_t size = info.size;

      /* check if this is a dead operand, then we can re-use the space from the definition
       * also use the correct stride for sub-dword operands */
      bool is_dead_operand = false;
      std::pair<PhysReg, bool> res{PhysReg(), false};
      if (instr->opcode == aco_opcode::p_create_vector) {
         res =
            get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
      } else {
         for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
            if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
               info = DefInfo(ctx, instr, var.rc, i);
               if (instr->operands[i].isKillBeforeDef()) {
                  info.bounds = def_reg;
                  res = get_reg_simple(ctx, reg_file, info);
                  is_dead_operand = true;
               }
               break;
            }
         }
      }
      if (!res.second) {
         /* Try to find space within the bounds but outside of the definition */
         info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
         res = get_reg_simple(ctx, reg_file, info);
         if (!res.second && def_reg.hi() <= bounds.hi()) {
            /* also try the area above the definition, stride-aligned */
            unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
            info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res.second) {
         /* mark the area as blocked */
         reg_file.block(res.first, var.rc);

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(res.first, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      PhysReg best_pos = bounds.lo();
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
           reg_win += stride) {
         if (!is_dead_operand && intersects(reg_win, def_reg))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (PhysReg j : reg_win) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(j) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               /* register with sub-dword variables: count as one move */
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs inside control flow */
            if (!(ctx.block->kind & block_kind_top_level) &&
                ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            /* don't displace a same-size-or-larger live variable: moving it
             * would be at least as hard as the current problem */
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_win.lo();
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      PhysRegInterval reg_win{best_pos, size};

      /* collect variables and block reg file */
      std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win);

      /* mark the area as blocked */
      reg_file.block(reg_win.lo(), var.rc);
      adjust_max_used_regs(ctx, var.rc, reg_win.lo());

      /* recursively relocate the variables displaced from the chosen window */
      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, bounds, instr, def_reg))
         return false;

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}

/* Find a register for the definition described by `info`, allowing live-range
 * splits: variables occupying the chosen window are moved elsewhere through
 * parallelcopies. Returns <register, success>. */
std::pair<PhysReg, bool>
get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
             std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
             aco_ptr<Instruction>& instr)
{
   const PhysRegInterval& bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(bounds);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   std::bitset<256> is_killed_operand; /* per-register */
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      Operand& op = instr->operands[j];
      if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
          !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
         assert(op.isFixed());

         for (unsigned i = 0; i < op.size(); ++i) {
            is_killed_operand[(op.physReg() & 0xff) + i] = true;
         }

         killed_ops += op.getTemp().size();
      }
   }

   assert(regs_free >= size);
   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   PhysRegInterval best_win = {bounds.lo(), size};
   unsigned num_moves = 0xFF;
   unsigned
num_vars = 0;

   /* we use a sliding window to check potential positions */
   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      /* first check if the register window starts in the middle of an
       * allocated variable: this is what we have to fix to allow for
       * num_moves > size */
      if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      for (const PhysReg j : reg_win) {
         /* dead operands effectively reduce the number of estimated moves */
         if (is_killed_operand[j & 0xFF]) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         if (reg_file[j] == 0xF0000000) {
            /* register with sub-dword allocations counts as a single move */
            k += 1;
            n++;
            continue;
         }

         /* don't displace variables at least as large as the definition */
         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs inside control flow */
         // TODO: ensure that live range splits inside control flow are never necessary
         if (!(ctx.block->kind & block_kind_top_level) &&
             ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      /* prefer fewer moves; on ties fewer variables; on full ties prefer an
       * aligned v4 window */
      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      if (found) {
         best_win = reg_win;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF)
      return {{}, false};

   /* now, we figured the placement for our definition */
   RegisterFile tmp_file(reg_file);

   /* p_create_vector: also re-place killed operands in the definition space */
   if (instr->opcode == aco_opcode::p_create_vector) {
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win);

   /* re-enable killed operands */
   if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) {
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, best_win))
      return {{}, false};

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   adjust_max_used_regs(ctx, rc, best_win.lo());
   return {best_win.lo(), true};
}

/* Check whether the specific register `reg` may hold a variable of class `rc`
 * for `instr`: validates sub-dword/SGPR alignment, bounds (with vcc and m0
 * special cases) and current occupancy. Updates the max-used-register
 * bookkeeping and returns true on success. */
bool
get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr,
                  PhysReg reg)
{
   /* catch out-of-range registers */
   if (reg >= PhysReg{512})
      return false;

   std::pair<unsigned, unsigned> sdw_def_info;
   if (rc.is_subdword())
      sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc);

   if (rc.is_subdword() && reg.byte() % sdw_def_info.first)
      return false;
   if (!rc.is_subdword() && reg.byte())
      return false;

   if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0)
      return false;

   PhysRegInterval reg_win = {reg, rc.size()};
   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
   PhysRegInterval vcc_win = {vcc, 2};
   /* VCC is outside the bounds */
   bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc;
   bool is_m0 = rc == s1 && reg == m0;
   if (!bounds.contains(reg_win) && !is_vcc && !is_m0)
      return false;

   if (rc.is_subdword()) {
      /* test the whole write region the sub-dword definition touches */
      PhysReg test_reg;
      test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1);
      if (reg_file.test(test_reg, sdw_def_info.second))
         return false;
   } else {
      if (reg_file.test(reg, rc.bytes()))
         return false;
   }

   adjust_max_used_regs(ctx, rc, reg_win.lo());
   return true;
}

/* Try to raise the program's register demand by one register of the given
 * type; returns false when the corresponding limit is already reached. */
bool
increase_register_file(ra_ctx& ctx, RegType type)
{
   if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
                                                          ctx.program->max_reg_demand.sgpr));
   } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
                                                          ctx.program->max_reg_demand.sgpr + 1));
   } else {
      return false;
   }
   return true;
}

/* variable ID paired with its register class */
struct IDAndRegClass {
   IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}

   unsigned id;
   RegClass rc;
};

/* variable ID paired with its precomputed allocation info */
struct IDAndInfo {
   IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}

   unsigned id;
   DefInfo info;
};

/* Reallocates vars by sorting them and placing each variable after the previous
 * one. If one of the variables has 0xffffffff as an ID, the register assigned
 * for that variable will be returned.
 */
PhysReg
compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
{
   /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
    * temporary sizes to dwords.
    */
   std::vector<IDAndInfo> sorted;
   for (IDAndRegClass var : vars) {
      DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
      sorted.emplace_back(var.id, info);
   }

   /* sort by decreasing stride so large-alignment variables are placed first
    * and no alignment padding is wasted */
   std::sort(
      sorted.begin(), sorted.end(),
      [&ctx](const IDAndInfo& a, const IDAndInfo& b)
      {
         unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
         unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
         if (a_stride > b_stride)
            return true;
         if (a_stride < b_stride)
            return false;
         if (a.id == 0xffffffff || b.id == 0xffffffff)
            return a.id ==
                   0xffffffff; /* place 0xffffffff before others if possible, not for any reason */
         return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
      });

   PhysReg next_reg = start;
   PhysReg space_reg;
   for (IDAndInfo& var : sorted) {
      unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
      next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));

      /* 0xffffffff is a special variable ID used reserve a space for killed
       * operands and definitions.
       */
      if (var.id != 0xffffffff) {
         if (next_reg != ctx.assignments[var.id].reg) {
            /* variable moves: record the parallelcopy */
            RegClass rc = ctx.assignments[var.id].rc;
            Temp tmp(var.id, rc);

            Operand pc_op(tmp);
            pc_op.setFixed(ctx.assignments[var.id].reg);
            Definition pc_def(next_reg, rc);
            parallelcopies.emplace_back(pc_op, pc_def);
         }
      } else {
         space_reg = next_reg;
      }

      adjust_max_used_regs(ctx, var.info.rc, next_reg);

      next_reg = next_reg.advance(var.info.rc.size() * 4);
   }

   return space_reg;
}

/* Returns true if the MIMG vaddr operands (operands[3..]) are (still) placeable
 * as one contiguous vector: every already-assigned component sits at its
 * expected offset from the inferred base, the whole vector fits in bounds, and
 * unassigned components' slots are not occupied by foreign temporaries. */
bool
is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
{
   PhysReg first{512}; /* 512 == "base not determined yet" sentinel */
   for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
      Operand op = instr->operands[i + 3];

      if (ctx.assignments[op.tempId()].assigned) {
         PhysReg reg = ctx.assignments[op.tempId()].reg;

         if (first.reg() == 512) {
            PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr);
            first = reg.advance(i * -4);
            PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
            if (!bounds.contains(vec)) /* not enough space for other operands */
               return false;
         } else {
            if (reg != first.advance(i * 4)) /* not at the best position */
               return false;
         }
      } else {
         /* If there's an unexpected temporary, this operand is unlikely to be
          * placed in the best position.
          */
         if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4))
            return false;
      }
   }

   return true;
}

/* Try to allocate `temp` relative to the other elements of the vector it will
 * be combined into (ctx.vectors), so that the eventual vector needs no extra
 * copies. Returns <register, success>. */
std::pair<PhysReg, bool>
get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
{
   Instruction* vec = ctx.vectors[temp.id()];
   /* for MIMG, operands[0..2] are not part of the vaddr vector */
   unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
   unsigned our_offset = 0;
   for (unsigned i = first_operand; i < vec->operands.size(); i++) {
      Operand& op = vec->operands[i];
      if (op.isTemp() && op.tempId() == temp.id())
         break;
      else
         our_offset += op.bytes();
   }

   if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) {
      unsigned their_offset = 0;
      /* check for every operand of the vector
       * - whether the operand is assigned and
       * - we can use the register relative to that operand
       */
      for (unsigned i = first_operand; i < vec->operands.size(); i++) {
         Operand& op = vec->operands[i];
         if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
             ctx.assignments[op.tempId()].assigned) {
            PhysReg reg = ctx.assignments[op.tempId()].reg;
            reg.reg_b += (our_offset - their_offset);
            if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
               return {reg, true};

            /* return if MIMG vaddr components don't remain vector-aligned */
            if (vec->format == Format::MIMG)
               return {{}, false};
         }
         their_offset += op.bytes();
      }

      /* We didn't find a register relative to other vector operands.
       * Try to find new space which fits the whole vector.
       */
      RegClass vec_rc = RegClass::get(temp.type(), their_offset);
      DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
      PhysReg reg = res.first;
      if (res.second) {
         reg.reg_b += our_offset;
         /* make sure to only use byte offset if the instruction supports it */
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
            return {reg, true};
      }
   }
   return {{}, false};
}

/* Main register-selection entry point for a temporary `temp` defined (or used)
 * by `instr`. Tries, in order: split-vector affinities, direct affinity, vcc
 * hint, vector placement, a search without live-range splits, then one with
 * splits; as a last resort it grows the register file or compacts all live
 * variables and retries. Parallelcopies needed for the choice are appended. */
PhysReg
get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
        std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
        int operand_index = -1)
{
   auto split_vec = ctx.split_vectors.find(temp.id());
   if (split_vec != ctx.split_vectors.end()) {
      /* try to place the vector so its split definitions land on their affinities */
      unsigned offset = 0;
      for (Definition def : split_vec->second->definitions) {
         if (ctx.assignments[def.tempId()].affinity) {
            assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
            if (affinity.assigned) {
               PhysReg reg = affinity.reg;
               reg.reg_b -= offset;
               if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
                  return reg;
            }
         }
         offset += def.bytes();
      }
   }

   if (ctx.assignments[temp.id()].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
      if (affinity.assigned) {
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg))
            return affinity.reg;
      }
   }
   if (ctx.assignments[temp.id()].vcc) {
      if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc))
         return vcc;
   }

   std::pair<PhysReg, bool> res;

   if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
      res = get_reg_vector(ctx, reg_file, temp, instr);
      if (res.second)
         return res.first;
   }

   DefInfo info(ctx, instr, temp.regClass(), operand_index);

   if (!ctx.policy.skip_optimistic_path) {
      /* try to find space without live-range splits */
      res = get_reg_simple(ctx, reg_file, info);

      if (res.second)
         return res.first;
   }

   /* try to find space with live-range splits */
   res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);

   if (res.second)
      return res.first;

   /* try using more registers */

   /* We should only fail here because keeping under the limit would require
    * too many moves. */
   assert(reg_file.count_zero(info.bounds) >= info.size);

   if (!increase_register_file(ctx, info.rc.type())) {
      /* fallback algorithm: reallocate all variables at once */
      unsigned def_size = info.rc.size();
      for (Definition def : instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_size += def.regClass().size();
      }

      unsigned killed_op_size = 0;
      for (Operand op : instr->operands) {
         if (op.isTemp() && op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_size += op.regClass().size();
      }

      const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type());

      /* reallocate passthrough variables and non-killed operands */
      std::vector<IDAndRegClass> vars;
      for (unsigned id : find_vars(ctx, reg_file, regs))
         vars.emplace_back(id, ctx.assignments[id].rc);
      vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));

      PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());

      /* reallocate killed operands */
      std::vector<IDAndRegClass> killed_op_vars;
      for (Operand op : instr->operands) {
         if (op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_vars.emplace_back(op.tempId(), op.regClass());
      }
      compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);

      /* reallocate definitions */
      std::vector<IDAndRegClass> def_vars;
      for (Definition def : instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_vars.emplace_back(def.tempId(), def.regClass());
      }
      def_vars.emplace_back(0xffffffff, info.rc);
      return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
   }

   /* the register file grew: retry the whole selection */
   return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
}

/* Choose the definition register for a p_create_vector so that as many killed
 * operands as possible already sit at their final offsets, minimizing shuffle
 * copies. Falls back to get_reg() when too many moves would be needed. */
PhysReg
get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
                      aco_ptr<Instruction>& instr)
{
   RegClass rc = temp.regClass();
   /* create_vector instructions have different costs w.r.t. register coalescing */
   uint32_t size = rc.size();
   uint32_t bytes = rc.bytes();
   uint32_t stride = get_stride(rc);
   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());

   // TODO: improve p_create_vector for sub-dword vectors

   PhysReg best_pos{0xFFF};
   unsigned num_moves = 0xFF;
   bool best_avoid = true;
   uint32_t correct_pos_mask = 0;

   /* test for each operand which definition placement causes the least shuffle instructions */
   for (unsigned i = 0, offset = 0; i < instr->operands.size();
        offset += instr->operands[i].bytes(), i++) {
      // TODO: think about, if we can alias live operands on the same register
      if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
          instr->operands[i].getTemp().type() != rc.type())
         continue;

      if (offset > instr->operands[i].physReg().reg_b)
         continue;

      /* candidate window: the vector placed so operand i stays where it is */
      unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
      if (reg_lower % 4)
         continue;
      PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
      unsigned k = 0;

      /* no need to check multiple times */
      if (reg_win.lo() == best_pos)
         continue;

      /* check borders */
      // TODO: this can be improved
      if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
         continue;
      if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* count variables to be moved and check "avoid" */
      bool avoid = false;
      bool linear_vgpr = false;
      for (PhysReg j : reg_win) {
         if (reg_file[j] != 0) {
            if (reg_file[j] == 0xF0000000) {
               /* count occupied bytes of a sub-dword register individually */
               PhysReg reg;
               reg.reg_b = j * 4;
               unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
               for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
                  k += reg_file.test(reg, 1);
            } else {
               k += 4;
               linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
            }
         }
         avoid |= ctx.war_hint[j];
      }

      if (linear_vgpr) {
         /* we cannot split live ranges of linear vgprs inside control flow */
         if (ctx.block->kind & block_kind_top_level)
            avoid = true;
         else
            continue;
      }

      if (avoid && !best_avoid)
         continue;

      /* count operands in wrong positions */
      uint32_t correct_pos_mask_new = 0;
      for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
           offset2 += instr->operands[j].bytes(), j++) {
         Operand& op = instr->operands[j];
         if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2)
            correct_pos_mask_new |= 1 << j;
         else
            k += op.bytes();
      }
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      if (k > num_moves || (!aligned && k == num_moves))
         continue;

      best_pos = reg_win.lo();
      num_moves = k;
      best_avoid = avoid;
      correct_pos_mask = correct_pos_mask_new;
   }

   /* too many moves: try the generic get_reg() function */
   if (num_moves >= 2 * bytes) {
      return get_reg(ctx, reg_file, temp, parallelcopies, instr);
   } else if (num_moves > bytes) {
      DefInfo info(ctx, instr, rc, -1);
      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
      if (res.second)
         return res.first;
   }

   /* re-enable killed operands which are in the wrong position */
   RegisterFile tmp_file(reg_file);
   for (Operand& op : instr->operands) {
      if (op.isTemp() && op.isFirstKillBeforeDef())
         tmp_file.fill(op);
   }
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill())
         tmp_file.clear(instr->operands[i]);
   }

   /* collect variables to be moved */
   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});

   bool success = false;
   std::vector<std::pair<Operand, Definition>> pc;
   success =
      get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size});

   if (!success) {
      if (!increase_register_file(ctx, temp.type())) {
         /* use the fallback algorithm in get_reg() */
         return get_reg(ctx, reg_file, temp, parallelcopies, instr);
      }
      /* register file grew: retry with the additional space */
      return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
   }

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
   adjust_max_used_regs(ctx, rc, best_pos);

   return best_pos;
}

/* For pseudo instructions lowered with handle_operands(): record whether SCC
 * is live across the instruction (tmp_in_scc) and pick a free scratch SGPR if
 * the lowering will need one. */
void
handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
{
   if (instr->format != Format::PSEUDO)
      return;

   /* all instructions which use handle_operands() need this information */
   switch (instr->opcode) {
   case aco_opcode::p_extract_vector:
   case aco_opcode::p_create_vector:
   case aco_opcode::p_split_vector:
   case aco_opcode::p_parallelcopy:
   case aco_opcode::p_wqm: break;
   default: return;
   }

   bool writes_linear = false;
   /* if all definitions are logical vgpr, no need to care for SCC */
   for (Definition& def : instr->definitions) {
      if (def.getTemp().regClass().is_linear())
         writes_linear = true;
   }
   /* if all operands are constant, no need to care either */
   bool reads_linear = false;
   bool reads_subdword = false;
   for (Operand& op : instr->operands) {
      if (op.isTemp() && op.getTemp().regClass().is_linear())
         reads_linear = true;
      if (op.isTemp() && op.regClass().is_subdword())
         reads_subdword = true;
   }
   bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) ||
                            (ctx.program->gfx_level <= GFX7 && reads_subdword);
   if (!needs_scratch_reg)
      return;

   instr->pseudo().tmp_in_scc = reg_file[scc];

   /* search downward from the highest used SGPR for a free one, then upward;
    * fall back to m0 when none is free (only valid for the subdword case) */
   int reg = ctx.max_used_sgpr;
   for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
      ;
   if (reg < 0) {
      reg = ctx.max_used_sgpr + 1;
      for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
         ;
      if (reg == ctx.program->max_reg_demand.sgpr) {
         assert(reads_subdword && reg_file[m0] == 0);
         reg = m0;
      }
   }

   adjust_max_used_regs(ctx, s1, reg);
   instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
}

/* Check whether operand `idx` of `instr` may be assigned `reg`: honors fixed
 * operands, the gfx9- v_writelane m0 restriction (which may fix the operand to
 * m0 as a side effect), sub-dword operand strides, and SMEM register rules. */
bool
operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
                    RegClass rc)
{
   if (instr->operands[idx].isFixed())
      return instr->operands[idx].physReg() == reg;

   bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
                       instr->opcode == aco_opcode::v_writelane_b32_e64;
   if (gfx_level <= GFX9 && is_writelane && idx <= 1) {
      /* v_writelane_b32 can take two sgprs but only if one is m0.
       */
      bool is_other_sgpr =
         instr->operands[!idx].isTemp() &&
         (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0);
      if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) {
         /* fix this operand to m0 so the other sgpr operand stays legal */
         instr->operands[idx].setFixed(m0);
         return reg == m0;
      }
   }

   if (reg.byte()) {
      unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
      if (reg.byte() % stride)
         return false;
   }

   switch (instr->format) {
   case Format::SMEM:
      return reg != scc && reg != exec &&
             (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
             (reg != vcc || (instr->definitions.empty() && idx == 2) ||
              gfx_level >= GFX10); /* sdata can be vcc */
   default:
      // TODO: there are more instructions with restrictions on registers
      return true;
   }
}

/* Move `operand` into a usable register: either its fixed target (evicting any
 * blocking variables) or a freshly chosen one. Emits the parallelcopy and
 * updates renames so later uses see the copied temporary. */
void
get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopy,
                    aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
{
   /* check if the operand is fixed */
   PhysReg src = ctx.assignments[operand.tempId()].reg;
   PhysReg dst;
   if (operand.isFixed()) {
      assert(operand.physReg() != src);

      /* check if target reg is blocked, and move away the blocking var */
      if (register_file.test(operand.physReg(), operand.bytes())) {
         PhysRegInterval target{operand.physReg(), operand.size()};

         RegisterFile tmp_file(register_file);

         std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, target);

         tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src
         tmp_file.block(operand.physReg(), operand.regClass());

         DefInfo info(ctx, instr, operand.regClass(), -1);
         get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr,
                             PhysRegInterval());
      }
      dst = operand.physReg();

   } else {
      /* clear the operand in case it's only a stride mismatch */
      register_file.clear(src, operand.regClass());
      dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
   }

   Operand pc_op = operand;
   pc_op.setFixed(src);
   Definition pc_def = Definition(dst, pc_op.regClass());
   parallelcopy.emplace_back(pc_op, pc_def);
   update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
}

/* Allocates a register for the phi definition `tmp` and applies the resulting
 * parallelcopies: copies sourced from an earlier phi in `instructions` update
 * that phi's definition in place instead of emitting a copy.
 * NOTE(review): function continues past this excerpt — tail not visible here. */
PhysReg
get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
            std::vector<aco_ptr<Instruction>>& instructions, Block& block,
            aco_ptr<Instruction>& phi, Temp tmp)
{
   std::vector<std::pair<Operand, Definition>> parallelcopy;
   PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
   update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);

   /* process parallelcopy */
   for (std::pair<Operand, Definition> pc : parallelcopy) {
      /* see if it's a copy from a different phi */
      // TODO: prefer moving some previous phis over live-ins
      // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
      // problem in practice since they can only be fixed to exec)
      Instruction* prev_phi = NULL;
      std::vector<aco_ptr<Instruction>>::iterator phi_it;
      for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
         if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
            prev_phi = phi_it->get();
      }
      if (prev_phi) {
         /* if so, just update that phi's register */
         prev_phi->definitions[0].setFixed(pc.second.physReg());
         register_file.fill(prev_phi->definitions[0]);
         ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
                                                               pc.second.regClass()};
         continue;
      }

      /* rename */
      std::unordered_map<unsigned, Temp>::iterator orig_it =
ctx.orig_names.find(pc.first.tempId()); 2023 Temp orig = pc.first.getTemp(); 2024 if (orig_it != ctx.orig_names.end()) 2025 orig = orig_it->second; 2026 else 2027 ctx.orig_names[pc.second.tempId()] = orig; 2028 ctx.renames[block.index][orig.id()] = pc.second.getTemp(); 2029 2030 /* otherwise, this is a live-in and we need to create a new phi 2031 * to move it in this block's predecessors */ 2032 aco_opcode opcode = 2033 pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; 2034 std::vector<unsigned>& preds = 2035 pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; 2036 aco_ptr<Instruction> new_phi{ 2037 create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)}; 2038 new_phi->definitions[0] = pc.second; 2039 for (unsigned i = 0; i < preds.size(); i++) 2040 new_phi->operands[i] = Operand(pc.first); 2041 instructions.emplace_back(std::move(new_phi)); 2042 2043 /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is 2044 * a loop header. 
2045 */ 2046 live_in.erase(orig.id()); 2047 } 2048 2049 return reg; 2050} 2051 2052void 2053get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file, 2054 std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in) 2055{ 2056 /* move all phis to instructions */ 2057 for (aco_ptr<Instruction>& phi : block.instructions) { 2058 if (!is_phi(phi)) 2059 break; 2060 if (!phi->definitions[0].isKill()) 2061 instructions.emplace_back(std::move(phi)); 2062 } 2063 2064 /* assign phis with all-matching registers to that register */ 2065 for (aco_ptr<Instruction>& phi : instructions) { 2066 Definition& definition = phi->definitions[0]; 2067 if (definition.isFixed()) 2068 continue; 2069 2070 if (!phi->operands[0].isTemp()) 2071 continue; 2072 2073 PhysReg reg = phi->operands[0].physReg(); 2074 auto OpsSame = [=](const Operand& op) -> bool 2075 { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); }; 2076 bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame); 2077 if (!all_same) 2078 continue; 2079 2080 if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) 2081 continue; 2082 2083 definition.setFixed(reg); 2084 register_file.fill(definition); 2085 ctx.assignments[definition.tempId()].set(definition); 2086 } 2087 2088 /* try to find a register that is used by at least one operand */ 2089 for (aco_ptr<Instruction>& phi : instructions) { 2090 Definition& definition = phi->definitions[0]; 2091 if (definition.isFixed()) 2092 continue; 2093 2094 /* use affinity if available */ 2095 if (ctx.assignments[definition.tempId()].affinity && 2096 ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) { 2097 assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity]; 2098 assert(affinity.rc == definition.regClass()); 2099 if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) { 2100 definition.setFixed(affinity.reg); 2101 
            register_file.fill(definition);
            ctx.assignments[definition.tempId()].set(definition);
            continue;
         }
      }

      /* by going backwards, we aim to avoid copies in else-blocks */
      for (int i = phi->operands.size() - 1; i >= 0; i--) {
         const Operand& op = phi->operands[i];
         if (!op.isTemp() || !op.isFixed())
            continue;

         PhysReg reg = op.physReg();
         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
            definition.setFixed(reg);
            register_file.fill(definition);
            ctx.assignments[definition.tempId()].set(definition);
            break;
         }
      }
   }

   /* find registers for phis where the register was blocked or no operand was assigned */

   /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */
   for (unsigned i = 0; i < instructions.size(); i++) {
      aco_ptr<Instruction>& phi = instructions[i];
      Definition& definition = phi->definitions[0];
      if (definition.isFixed())
         continue;

      definition.setFixed(
         get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp()));

      register_file.fill(definition);
      ctx.assignments[definition.tempId()].set(definition);
   }
}

/* Returns the current name of `val` in block `block_idx`: the rename recorded for it, or
 * `val` itself if it was never renamed in that block. */
Temp
read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
{
   std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
   if (it == ctx.renames[block_idx].end())
      return val;
   else
      return it->second;
}

/* SSA repair for a live-in: resolves the current name of `val` at the start of `block`.
 * If the predecessors disagree on the name, a new phi is inserted at the top of the block
 * and its freshly allocated temp is returned. */
Temp
handle_live_in(ra_ctx& ctx, Temp val, Block* block)
{
   /* Linear temps follow linear predecessors, others the logical CFG. */
   std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
   if (preds.size() == 0)
      return val;

   if (preds.size() == 1) {
      /* if the block has only one predecessor, just look there for the name */
      return read_variable(ctx, val, preds[0]);
   }

   /* there are multiple predecessors and the block is sealed */
   Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));

   /* get the rename from each predecessor and check if they are the same */
   Temp new_val;
   bool needs_phi = false;
   for (unsigned i = 0; i < preds.size(); i++) {
      ops[i] = read_variable(ctx, val, preds[i]);
      if (i == 0)
         new_val = ops[i];
      else
         needs_phi |= !(new_val == ops[i]);
   }

   if (needs_phi) {
      assert(!val.regClass().is_linear_vgpr());

      /* the variable has been renamed differently in the predecessors: we need to insert a phi */
      aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
      aco_ptr<Instruction> phi{
         create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
      new_val = ctx.program->allocateTmp(val.regClass());
      phi->definitions[0] = Definition(new_val);
      /* Keep the assignments table in sync with the newly allocated temp id. */
      ctx.assignments.emplace_back();
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
      for (unsigned i = 0; i < preds.size(); i++) {
         /* update the operands so that it uses the new affinity */
         phi->operands[i] = Operand(ops[i]);
         assert(ctx.assignments[ops[i].id()].assigned);
         assert(ops[i].regClass() == new_val.regClass());
         phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
      }
      block->instructions.insert(block->instructions.begin(), std::move(phi));
   }

   return new_val;
}

/* Repairs SSA for variables that were renamed inside a loop: creates header phis for them
 * (via handle_live_in) and propagates the new names through all blocks of the loop
 * [loop_header_idx, loop_exit_idx). Body continues in the next chunk. */
void
handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
                 uint32_t loop_exit_idx)
{
   Block& loop_header = ctx.program->blocks[loop_header_idx];
   /* NOTE(review): continuation of handle_loop_phis(); `renames` maps the pre-loop name to the
    * phi temp created for it below. */
   std::unordered_map<unsigned, Temp> renames;

   /* create phis for variables renamed during the loop */
   for (unsigned t : live_in) {
      Temp val = Temp(t, ctx.program->temp_rc[t]);
      /* Name in the preheader vs. name after SSA repair at the header. */
      Temp prev = read_variable(ctx, val, loop_header_idx - 1);
      Temp renamed = handle_live_in(ctx, val, &loop_header);
      if (renamed == prev)
         continue;

      /* insert additional renames at block end, but don't overwrite */
      renames[prev.id()] = renamed;
      ctx.orig_names[renamed.id()] = val;
      for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
         auto it = ctx.renames[idx].emplace(val.id(), renamed);
         /* if insertion is unsuccessful, update if necessary */
         if (!it.second && it.first->second == prev)
            it.first->second = renamed;
      }

      /* update loop-carried values of the phi created by handle_live_in() */
      for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
         Operand& op = loop_header.instructions[0]->operands[i];
         if (op.getTemp() == prev)
            op.setTemp(renamed);
      }

      /* use the assignment from the loop preheader and fix def reg */
      assignment& var = ctx.assignments[prev.id()];
      ctx.assignments[renamed.id()] = var;
      loop_header.instructions[0]->definitions[0].setFixed(var.reg);
   }

   /* rename loop carried phi operands */
   /* NOTE(review): starts at index renames.size() — presumably skipping the phis just created
    * above, which are already correct; confirm against handle_live_in()'s insertion order. */
   for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
      aco_ptr<Instruction>& phi = loop_header.instructions[i];
      if (!is_phi(phi))
         break;
      const std::vector<unsigned>& preds =
         phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
      for (unsigned j = 1; j < phi->operands.size(); j++) {
         Operand& op = phi->operands[j];
         if (!op.isTemp())
            continue;

         /* Find the original name, since this operand might not use the original name if the phi
          * was created after init_reg_file(). */
         std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(op.tempId());
         Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();

         op.setTemp(read_variable(ctx, orig, preds[j]));
         op.setFixed(ctx.assignments[op.tempId()].reg);
      }
   }

   /* return early if no new phi was created */
   if (renames.empty())
      return;

   /* propagate new renames through loop */
   for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
      Block& current = ctx.program->blocks[idx];
      /* rename all uses in this block */
      for (aco_ptr<Instruction>& instr : current.instructions) {
         /* phis are renamed after RA */
         if (idx == loop_header_idx && is_phi(instr))
            continue;

         for (Operand& op : instr->operands) {
            if (!op.isTemp())
               continue;

            auto rename = renames.find(op.tempId());
            if (rename != renames.end()) {
               assert(rename->second.id());
               op.setTemp(rename->second);
            }
         }
      }
   }
}

/**
 * This function serves the purpose to correctly initialize the register file
 * at the beginning of a block (before any existing phis).
 * In order to do so, all live-in variables are entered into the RegisterFile.
 * Reg-to-reg moves (renames) from previous blocks are taken into account and
 * the SSA is repaired by inserting corresponding phi-nodes.
 */
RegisterFile
init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
{
   /* Leaving a loop: finish the loop's SSA repair before initializing this block. */
   if (block.kind & block_kind_loop_exit) {
      uint32_t header = ctx.loop_header.back();
      ctx.loop_header.pop_back();
      handle_loop_phis(ctx, live_out_per_block[header], header, block.index);
   }

   RegisterFile register_file;
   const IDSet& live_in = live_out_per_block[block.index];
   assert(block.index != 0 || live_in.empty());

   if (block.kind & block_kind_loop_header) {
      ctx.loop_header.emplace_back(block.index);
      /* already rename phis incoming value */
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (!is_phi(instr))
            break;
         Operand& operand = instr->operands[0];
         if (operand.isTemp()) {
            operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
            operand.setFixed(ctx.assignments[operand.tempId()].reg);
         }
      }
      /* Loop headers only consult the preheader (block.index - 1) for names; back-edge
       * renames are patched in later by handle_loop_phis(). */
      for (unsigned t : live_in) {
         Temp val = Temp(t, ctx.program->temp_rc[t]);
         Temp renamed = read_variable(ctx, val, block.index - 1);
         if (renamed != val)
            ctx.renames[block.index][val.id()] = renamed;
         assignment& var = ctx.assignments[renamed.id()];
         assert(var.assigned);
         register_file.fill(Definition(renamed.id(), var.reg, var.rc));
      }
   } else {
      /* rename phi operands */
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (!is_phi(instr))
            break;
         const std::vector<unsigned>& preds =
            instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;

         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand& operand = instr->operands[i];
            if (!operand.isTemp())
               continue;
            operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
            operand.setFixed(ctx.assignments[operand.tempId()].reg);
         }
      }
      for (unsigned t : live_in) {
         Temp val = Temp(t, ctx.program->temp_rc[t]);
         Temp renamed = handle_live_in(ctx, val, &block);
         assignment& var = ctx.assignments[renamed.id()];
         /* due to live-range splits, the live-in might be a phi, now */
         if (var.assigned) {
            register_file.fill(Definition(renamed.id(), var.reg, var.rc));
         }
         if (renamed != val) {
            ctx.renames[block.index].emplace(t, renamed);
            ctx.orig_names[renamed.id()] = val;
         }
      }
   }

   return register_file;
}

/* Pre-pass over the whole program (blocks visited in reverse) that records allocation hints:
 * which temps should land in VCC, which belong to vectors/split-vectors, and phi "affinity"
 * chains so related temps prefer the same register. Body continues in the next chunks. */
void
get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
{
   /* Each entry is a merge set of temps that would like to share a register; element 0 is the
    * representative. temp_to_phi_ressources maps a temp id to its set's index. */
   std::vector<std::vector<Temp>> phi_ressources;
   std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;

   for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
        block_rit++) {
      Block& block = *block_rit;

      /* first, compute the death points of all live vars within the block */
      IDSet& live = live_out_per_block[block.index];

      std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
      for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
         aco_ptr<Instruction>& instr = *rit;
         if (is_phi(instr))
            break;

         /* add vector affinities */
         if (instr->opcode == aco_opcode::p_create_vector) {
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isFirstKill() &&
                   op.getTemp().type() == instr->definitions[0].getTemp().type())
                  ctx.vectors[op.tempId()] = instr.get();
            }
         } else if (instr->format == Format::MIMG && instr->operands.size() > 4)
         {
            /* NOTE(review): MIMG with >4 operands — operands from index 3 on are treated as a
             * vector (presumably NSA addresses; confirm against instruction selection). */
            for (unsigned i = 3; i < instr->operands.size(); i++)
               ctx.vectors[instr->operands[i].tempId()] = instr.get();
         } else if (instr->opcode == aco_opcode::p_split_vector &&
                    instr->operands[0].isFirstKillBeforeDef()) {
            ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
         } else if (instr->isVOPC() && !instr->isVOP3()) {
            /* Compare result of a plain VOPC encoding prefers VCC. */
            if (!instr->isSDWA() || ctx.program->gfx_level == GFX8)
               ctx.assignments[instr->definitions[0].tempId()].vcc = true;
         } else if (instr->isVOP2() && !instr->isVOP3()) {
            /* VOP2 carry-in operand and carry-out definition prefer VCC. */
            if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
                instr->operands[2].regClass().type() == RegType::sgpr)
               ctx.assignments[instr->operands[2].tempId()].vcc = true;
            if (instr->definitions.size() == 2)
               ctx.assignments[instr->definitions[1].tempId()].vcc = true;
         } else if (instr->opcode == aco_opcode::s_and_b32 ||
                    instr->opcode == aco_opcode::s_and_b64) {
            /* If SCC is used by a branch, we might be able to use
             * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC. */
            if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() &&
                instr->operands[1].isFixed() && instr->operands[1].physReg() == exec)
               ctx.assignments[instr->operands[0].tempId()].vcc = true;
         }

         /* add operands to live variables */
         for (const Operand& op : instr->operands) {
            if (op.isTemp())
               live.insert(op.tempId());
         }

         /* erase definitions from live */
         for (unsigned i = 0; i < instr->definitions.size(); i++) {
            const Definition& def = instr->definitions[i];
            if (!def.isTemp())
               continue;
            live.erase(def.tempId());
            /* mark last-seen phi operand */
            std::unordered_map<unsigned, unsigned>::iterator it =
               temp_to_phi_ressources.find(def.tempId());
            if (it != temp_to_phi_ressources.end() &&
                def.regClass() == phi_ressources[it->second][0].regClass()) {
               phi_ressources[it->second][0] = def.getTemp();
               /* try to coalesce phi affinities with parallelcopies */
               Operand op = Operand();
               switch (instr->opcode) {
               case aco_opcode::p_parallelcopy: op = instr->operands[i]; break;

               /* These opcodes read and overwrite operand 2, so that operand can join the
                * definition's merge set. */
               case aco_opcode::v_interp_p2_f32:
               case aco_opcode::v_writelane_b32:
               case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break;

               case aco_opcode::v_fma_f32:
               case aco_opcode::v_fma_f16:
               case aco_opcode::v_pk_fma_f16:
                  if (ctx.program->gfx_level < GFX10)
                     continue;
                  FALLTHROUGH;
               case aco_opcode::v_mad_f32:
               case aco_opcode::v_mad_f16:
                  if (instr->usesModifiers())
                     continue;
                  op = instr->operands[2];
                  break;

               case aco_opcode::v_mad_legacy_f32:
               case aco_opcode::v_fma_legacy_f32:
                  if (instr->usesModifiers() || !ctx.program->dev.has_mac_legacy32)
                     continue;
                  op = instr->operands[2];
                  break;

               default: continue;
               }

               if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
                  /* NOTE(review): continuation of get_affinities() — add the coalescable
                   * operand to the definition's merge set. */
                  phi_ressources[it->second].emplace_back(op.getTemp());
                  temp_to_phi_ressources[op.tempId()] = it->second;
               }
            }
         }
      }

      /* collect phi affinities */
      for (; rit != block.instructions.rend(); ++rit) {
         aco_ptr<Instruction>& instr = *rit;
         assert(is_phi(instr));

         live.erase(instr->definitions[0].tempId());
         if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
            continue;

         assert(instr->definitions[0].isTemp());
         std::unordered_map<unsigned, unsigned>::iterator it =
            temp_to_phi_ressources.find(instr->definitions[0].tempId());
         unsigned index = phi_ressources.size();
         std::vector<Temp>* affinity_related;
         if (it != temp_to_phi_ressources.end()) {
            /* The phi def already belongs to a merge set: make it the representative. */
            index = it->second;
            phi_ressources[index][0] = instr->definitions[0].getTemp();
            affinity_related = &phi_ressources[index];
         } else {
            phi_ressources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
            affinity_related = &phi_ressources.back();
         }

         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
               affinity_related->emplace_back(op.getTemp());
               if (block.kind & block_kind_loop_header)
                  continue;
               temp_to_phi_ressources[op.tempId()] = index;
            }
         }
      }

      /* visit the loop header phis first in order to create nested affinities */
      if (block.kind & block_kind_loop_exit) {
         /* find loop header */
         auto header_rit = block_rit;
         while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
            header_rit++;

         for (aco_ptr<Instruction>& phi : header_rit->instructions) {
            if (!is_phi(phi))
               break;
            if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
               continue;

            /* create an (empty) merge-set for the phi-related variables */
            auto it = temp_to_phi_ressources.find(phi->definitions[0].tempId());
            unsigned index = phi_ressources.size();
            if (it == temp_to_phi_ressources.end()) {
               temp_to_phi_ressources[phi->definitions[0].tempId()] = index;
               phi_ressources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
            } else {
               index = it->second;
            }
            /* Operand 0 is skipped: only the loop-carried (back-edge) operands join the set. */
            for (unsigned i = 1; i < phi->operands.size(); i++) {
               const Operand& op = phi->operands[i];
               if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
                  temp_to_phi_ressources[op.tempId()] = index;
               }
            }
         }
      }
   }
   /* create affinities */
   for (std::vector<Temp>& vec : phi_ressources) {
      for (unsigned i = 1; i < vec.size(); i++)
         if (vec[i].id() != vec[0].id())
            ctx.assignments[vec[i].id()].affinity = vec[0].id();
   }
}

/* Rewrites eligible 3-operand VALU instructions (v_mad_f32 etc.) in place to their shorter
 * VOP2 read-modify-write forms (v_mac_f32 etc.), reusing operand 2 as the destination.
 * Bails out unless operand 2 is a killed VGPR temp, at least one of operands 0/1 is a VGPR,
 * no modifiers are used, all operands are dword-aligned, and the change would not defeat an
 * already-assigned affinity register. */
void
optimize_encoding_vop2(Program* program, ra_ctx& ctx, RegisterFile& register_file,
                       aco_ptr<Instruction>& instr)
{
   /* try to optimize v_mad_f32 -> v_mac_f32 */
   if ((instr->opcode != aco_opcode::v_mad_f32 &&
        (instr->opcode != aco_opcode::v_fma_f32 || program->gfx_level < GFX10) &&
        instr->opcode != aco_opcode::v_mad_f16 && instr->opcode != aco_opcode::v_mad_legacy_f16 &&
        (instr->opcode != aco_opcode::v_fma_f16 || program->gfx_level < GFX10) &&
        (instr->opcode != aco_opcode::v_pk_fma_f16 || program->gfx_level < GFX10) &&
        (instr->opcode != aco_opcode::v_mad_legacy_f32 || !program->dev.has_mac_legacy32) &&
        (instr->opcode != aco_opcode::v_fma_legacy_f32 || !program->dev.has_mac_legacy32) &&
        (instr->opcode != aco_opcode::v_dot4_i32_i8 || program->family == CHIP_VEGA20)) ||
       !instr->operands[2].isTemp() || !instr->operands[2].isKillBeforeDef() ||
       instr->operands[2].getTemp().type() != RegType::vgpr ||
       ((!instr->operands[0].isTemp() || instr->operands[0].getTemp().type() != RegType::vgpr) &&
        (!instr->operands[1].isTemp() ||
         instr->operands[1].getTemp().type() != RegType::vgpr)) ||
       instr->usesModifiers() || instr->operands[0].physReg().byte() != 0 ||
       instr->operands[1].physReg().byte() != 0 || instr->operands[2].physReg().byte() != 0)
      return;

   /* VOP2 src1 must be a VGPR: swap operands 0/1 if needed. */
   if (!instr->operands[1].isTemp() || instr->operands[1].getTemp().type() != RegType::vgpr)
      std::swap(instr->operands[0], instr->operands[1]);

   /* Don't convert if the definition's affinity register is still attainable. */
   unsigned def_id = instr->definitions[0].tempId();
   if (ctx.assignments[def_id].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
      if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
          !register_file.test(affinity.reg, instr->operands[2].bytes()))
         return;
   }

   /* The instruction is reused in place, so the smaller struct must fit. */
   static_assert(sizeof(VOP2_instruction) <= sizeof(VOP3_instruction),
                 "Invalid direct instruction cast.");
   static_assert(sizeof(VOP2_instruction) <= sizeof(VOP3P_instruction),
                 "Invalid direct instruction cast.");
   instr->format = Format::VOP2;
   switch (instr->opcode) {
   case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
   case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
   case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
   case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
   case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
   case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break;
   case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break;
   default: break;
   }
}

/* Rewrites eligible SOP2 instructions with a small literal operand to the shorter SOPK
 * encoding (s_addk_i32 etc.), which folds the literal into a 16-bit immediate and reuses
 * the non-literal operand as the destination. Body continues in the next chunk. */
void
optimize_encoding_sopk(Program* program, ra_ctx& ctx, RegisterFile& register_file,
                       aco_ptr<Instruction>& instr)
{
   /* try to optimize sop2 with literal source to sopk */
   /* NOTE(review): continuation of optimize_encoding_sopk(). */
   if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_mul_i32 &&
       instr->opcode != aco_opcode::s_cselect_b32)
      return;

   uint32_t literal_idx = 0;

   /* s_cselect_b32's operand order matters (it keeps SCC semantics), so only the
    * commutative opcodes may take the literal from operand 1. */
   if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral())
      literal_idx = 1;

   /* The other operand becomes the SOPK destination: it must be a killed SGPR temp in the
    * directly-encodable range (< 128). */
   if (!instr->operands[!literal_idx].isTemp() ||
       !instr->operands[!literal_idx].isKillBeforeDef() ||
       instr->operands[!literal_idx].getTemp().type() != RegType::sgpr ||
       instr->operands[!literal_idx].physReg() >= 128)
      return;

   if (!instr->operands[literal_idx].isLiteral())
      return;

   /* The literal must fit into a sign-extended 16-bit immediate. */
   const uint32_t i16_mask = 0xffff8000u;
   uint32_t value = instr->operands[literal_idx].constantValue();
   if ((value & i16_mask) && (value & i16_mask) != i16_mask)
      return;

   /* Don't convert if the definition's affinity register is still attainable. */
   unsigned def_id = instr->definitions[0].tempId();
   if (ctx.assignments[def_id].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
      if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() &&
          !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes()))
         return;
   }

   /* The instruction is reused in place, so the smaller struct must fit. */
   static_assert(sizeof(SOPK_instruction) <= sizeof(SOP2_instruction),
                 "Invalid direct instruction cast.");
   instr->format = Format::SOPK;
   SOPK_instruction* instr_sopk = &instr->sopk();

   instr_sopk->imm = instr_sopk->operands[literal_idx].constantValue() & 0xffff;
   /* Move the register operand to slot 0 and drop the literal operand. */
   if (literal_idx == 0)
      std::swap(instr_sopk->operands[0], instr_sopk->operands[1]);
   if (instr_sopk->operands.size() > 2)
      std::swap(instr_sopk->operands[1], instr_sopk->operands[2]);
   instr_sopk->operands.pop_back();

   switch (instr_sopk->opcode) {
   case aco_opcode::s_add_i32: instr_sopk->opcode = aco_opcode::s_addk_i32; break;
   case aco_opcode::s_mul_i32: instr_sopk->opcode = aco_opcode::s_mulk_i32; break;
   case aco_opcode::s_cselect_b32: instr_sopk->opcode = aco_opcode::s_cmovk_i32; break;
   default: unreachable("illegal instruction");
   }
}

/* Dispatches the in-place encoding shrink passes for the instruction's class. */
void
optimize_encoding(Program* program, ra_ctx& ctx, RegisterFile& register_file,
                  aco_ptr<Instruction>& instr)
{
   if (instr->isVALU())
      optimize_encoding_vop2(program, ctx, register_file, instr);
   if (instr->isSALU())
      optimize_encoding_sopk(program, ctx, register_file, instr);
}

} /* end namespace */

/* Main register-allocation entry point: walks all blocks, initializing the register file per
 * block, assigning phi and instruction registers, and emitting parallelcopies as needed.
 * (Only the beginning of this function is in this chunk; it continues below.) */
void
register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
{
   ra_ctx ctx(program, policy);
   get_affinities(ctx, live_out_per_block);

   for (Block& block : program->blocks) {
      ctx.block = &block;

      /* initialize register file */
      RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
      ctx.war_hint.reset();

      std::vector<aco_ptr<Instruction>> instructions;

      /* this is a slight adjustment from the paper as we already have phi nodes:
       * We consider them incomplete phis and only handle the definition. */
      get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]);

      /* If this is a merge block, the state of the register file at the branch instruction of the
       * predecessors corresponds to the state after phis at the merge block. So, we allocate a
       * register for the predecessor's branch definitions as if there was a phi.
2685 */ 2686 if (!block.linear_preds.empty() && 2687 (block.linear_preds.size() != 1 || 2688 program->blocks[block.linear_preds[0]].linear_succs.size() == 1)) { 2689 PhysReg br_reg = get_reg_phi(ctx, live_out_per_block[block.index], register_file, 2690 instructions, block, ctx.phi_dummy, Temp(0, s2)); 2691 for (unsigned pred : block.linear_preds) { 2692 program->blocks[pred].scc_live_out = register_file[scc]; 2693 aco_ptr<Instruction>& br = program->blocks[pred].instructions.back(); 2694 2695 assert(br->definitions.size() == 1 && br->definitions[0].regClass() == s2 && 2696 br->definitions[0].isKill()); 2697 2698 br->definitions[0].setFixed(br_reg); 2699 } 2700 } 2701 2702 /* Handle all other instructions of the block */ 2703 auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); }; 2704 std::vector<aco_ptr<Instruction>>::iterator instr_it = 2705 std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi); 2706 for (; instr_it != block.instructions.end(); ++instr_it) { 2707 aco_ptr<Instruction>& instr = *instr_it; 2708 2709 /* parallelcopies from p_phi are inserted here which means 2710 * live ranges of killed operands end here as well */ 2711 if (instr->opcode == aco_opcode::p_logical_end) { 2712 /* no need to process this instruction any further */ 2713 if (block.logical_succs.size() != 1) { 2714 instructions.emplace_back(std::move(instr)); 2715 continue; 2716 } 2717 2718 Block& succ = program->blocks[block.logical_succs[0]]; 2719 unsigned idx = 0; 2720 for (; idx < succ.logical_preds.size(); idx++) { 2721 if (succ.logical_preds[idx] == block.index) 2722 break; 2723 } 2724 for (aco_ptr<Instruction>& phi : succ.instructions) { 2725 if (phi->opcode == aco_opcode::p_phi) { 2726 if (phi->operands[idx].isTemp() && 2727 phi->operands[idx].getTemp().type() == RegType::sgpr && 2728 phi->operands[idx].isFirstKillBeforeDef()) { 2729 Definition phi_op( 2730 read_variable(ctx, phi->operands[idx].getTemp(), block.index)); 2731 
phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg); 2732 register_file.clear(phi_op); 2733 } 2734 } else if (phi->opcode != aco_opcode::p_linear_phi) { 2735 break; 2736 } 2737 } 2738 instructions.emplace_back(std::move(instr)); 2739 continue; 2740 } 2741 2742 /* unconditional branches are handled after phis of the target */ 2743 if (instr->opcode == aco_opcode::p_branch) { 2744 /* last instruction of the block */ 2745 instructions.emplace_back(std::move(instr)); 2746 break; 2747 } 2748 2749 std::vector<std::pair<Operand, Definition>> parallelcopy; 2750 2751 assert(!is_phi(instr)); 2752 2753 bool temp_in_scc = register_file[scc]; 2754 2755 /* handle operands */ 2756 for (unsigned i = 0; i < instr->operands.size(); ++i) { 2757 auto& operand = instr->operands[i]; 2758 if (!operand.isTemp()) 2759 continue; 2760 2761 /* rename operands */ 2762 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index)); 2763 assert(ctx.assignments[operand.tempId()].assigned); 2764 2765 PhysReg reg = ctx.assignments[operand.tempId()].reg; 2766 if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass())) 2767 operand.setFixed(reg); 2768 else 2769 get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i); 2770 2771 if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) || 2772 (instr->isDS() && instr->ds().gds)) { 2773 for (unsigned j = 0; j < operand.size(); j++) 2774 ctx.war_hint.set(operand.physReg().reg() + j); 2775 } 2776 } 2777 2778 /* remove dead vars from register file */ 2779 for (const Operand& op : instr->operands) { 2780 if (op.isTemp() && op.isFirstKillBeforeDef()) 2781 register_file.clear(op); 2782 } 2783 2784 optimize_encoding(program, ctx, register_file, instr); 2785 2786 /* Handle definitions which must have the same register as an operand. 
2787 * We expect that the definition has the same size as the operand, otherwise the new 2788 * location for the operand (if it's not killed) might intersect with the old one. 2789 * We can't read from the old location because it's corrupted, and we can't write the new 2790 * location because that's used by a live-through operand. 2791 */ 2792 if (instr->opcode == aco_opcode::v_interp_p2_f32 || 2793 instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 || 2794 instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 || 2795 instr->opcode == aco_opcode::v_mac_legacy_f32 || 2796 instr->opcode == aco_opcode::v_fmac_legacy_f32 || 2797 instr->opcode == aco_opcode::v_pk_fmac_f16 || 2798 instr->opcode == aco_opcode::v_writelane_b32 || 2799 instr->opcode == aco_opcode::v_writelane_b32_e64 || 2800 instr->opcode == aco_opcode::v_dot4c_i32_i8) { 2801 assert(instr->definitions[0].bytes() == instr->operands[2].bytes() || 2802 instr->operands[2].regClass() == v1); 2803 instr->definitions[0].setFixed(instr->operands[2].physReg()); 2804 } else if (instr->opcode == aco_opcode::s_addk_i32 || 2805 instr->opcode == aco_opcode::s_mulk_i32 || 2806 instr->opcode == aco_opcode::s_cmovk_i32) { 2807 assert(instr->definitions[0].bytes() == instr->operands[0].bytes()); 2808 instr->definitions[0].setFixed(instr->operands[0].physReg()); 2809 } else if (instr->isMUBUF() && instr->definitions.size() == 1 && 2810 instr->operands.size() == 4) { 2811 assert(instr->definitions[0].bytes() == instr->operands[3].bytes()); 2812 instr->definitions[0].setFixed(instr->operands[3].physReg()); 2813 } else if (instr->isMIMG() && instr->definitions.size() == 1 && 2814 !instr->operands[2].isUndefined()) { 2815 assert(instr->definitions[0].bytes() == instr->operands[2].bytes()); 2816 instr->definitions[0].setFixed(instr->operands[2].physReg()); 2817 } 2818 2819 ctx.defs_done.reset(); 2820 2821 /* handle fixed definitions first */ 2822 for (unsigned i = 0; i < 
instr->definitions.size(); ++i) { 2823 auto& definition = instr->definitions[i]; 2824 if (!definition.isFixed()) 2825 continue; 2826 2827 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); 2828 /* check if the target register is blocked */ 2829 if (register_file.test(definition.physReg(), definition.bytes())) { 2830 const PhysRegInterval def_regs{definition.physReg(), definition.size()}; 2831 2832 /* create parallelcopy pair to move blocking vars */ 2833 std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs); 2834 2835 RegisterFile tmp_file(register_file); 2836 /* re-enable the killed operands, so that we don't move the blocking vars there */ 2837 for (const Operand& op : instr->operands) { 2838 if (op.isTemp() && op.isFirstKillBeforeDef()) 2839 tmp_file.fill(op); 2840 } 2841 2842 ASSERTED bool success = false; 2843 DefInfo info(ctx, instr, definition.regClass(), -1); 2844 success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr, 2845 def_regs); 2846 assert(success); 2847 2848 update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0); 2849 } 2850 ctx.defs_done.set(i); 2851 2852 if (!definition.isTemp()) 2853 continue; 2854 2855 ctx.assignments[definition.tempId()].set(definition); 2856 register_file.fill(definition); 2857 } 2858 2859 /* handle all other definitions */ 2860 for (unsigned i = 0; i < instr->definitions.size(); ++i) { 2861 Definition* definition = &instr->definitions[i]; 2862 2863 if (definition->isFixed() || !definition->isTemp()) 2864 continue; 2865 2866 /* find free reg */ 2867 if (instr->opcode == aco_opcode::p_split_vector) { 2868 PhysReg reg = instr->operands[0].physReg(); 2869 RegClass rc = definition->regClass(); 2870 for (unsigned j = 0; j < i; j++) 2871 reg.reg_b += instr->definitions[j].bytes(); 2872 if (get_reg_specified(ctx, register_file, rc, instr, reg)) { 2873 definition->setFixed(reg); 2874 } else if (i == 0) { 2875 RegClass vec_rc = 
RegClass::get(rc.type(), instr->operands[0].bytes()); 2876 DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1); 2877 std::pair<PhysReg, bool> res = get_reg_simple(ctx, register_file, info); 2878 reg = res.first; 2879 if (res.second && get_reg_specified(ctx, register_file, rc, instr, reg)) 2880 definition->setFixed(reg); 2881 } else if (instr->definitions[i - 1].isFixed()) { 2882 reg = instr->definitions[i - 1].physReg(); 2883 reg.reg_b += instr->definitions[i - 1].bytes(); 2884 if (get_reg_specified(ctx, register_file, rc, instr, reg)) 2885 definition->setFixed(reg); 2886 } 2887 } else if (instr->opcode == aco_opcode::p_wqm || 2888 instr->opcode == aco_opcode::p_parallelcopy) { 2889 PhysReg reg = instr->operands[i].physReg(); 2890 if (instr->operands[i].isTemp() && 2891 instr->operands[i].getTemp().type() == definition->getTemp().type() && 2892 !register_file.test(reg, definition->bytes())) 2893 definition->setFixed(reg); 2894 } else if (instr->opcode == aco_opcode::p_extract_vector) { 2895 PhysReg reg = instr->operands[0].physReg(); 2896 reg.reg_b += definition->bytes() * instr->operands[1].constantValue(); 2897 if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg)) 2898 definition->setFixed(reg); 2899 } else if (instr->opcode == aco_opcode::p_create_vector) { 2900 PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(), 2901 parallelcopy, instr); 2902 update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0); 2903 definition->setFixed(reg); 2904 } 2905 2906 if (!definition->isFixed()) { 2907 Temp tmp = definition->getTemp(); 2908 if (definition->regClass().is_subdword() && definition->bytes() < 4) { 2909 PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr); 2910 definition->setFixed(reg); 2911 if (reg.byte() || register_file.test(reg, 4)) { 2912 add_subdword_definition(program, instr, reg); 2913 definition = &instr->definitions[i]; /* add_subdword_definition can invalidate 2914 the reference 
*/ 2915 } 2916 } else { 2917 definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr)); 2918 } 2919 update_renames(ctx, register_file, parallelcopy, instr, 2920 instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops 2921 : (UpdateRenames)0); 2922 } 2923 2924 assert( 2925 definition->isFixed() && 2926 ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) || 2927 (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256))); 2928 ctx.defs_done.set(i); 2929 ctx.assignments[definition->tempId()].set(*definition); 2930 register_file.fill(*definition); 2931 } 2932 2933 handle_pseudo(ctx, register_file, instr.get()); 2934 2935 /* kill definitions and late-kill operands and ensure that sub-dword operands can actually 2936 * be read */ 2937 for (const Definition& def : instr->definitions) { 2938 if (def.isTemp() && def.isKill()) 2939 register_file.clear(def); 2940 } 2941 for (unsigned i = 0; i < instr->operands.size(); i++) { 2942 const Operand& op = instr->operands[i]; 2943 if (op.isTemp() && op.isFirstKill() && op.isLateKill()) 2944 register_file.clear(op); 2945 if (op.isTemp() && op.physReg().byte() != 0) 2946 add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass()); 2947 } 2948 2949 /* emit parallelcopy */ 2950 if (!parallelcopy.empty()) { 2951 aco_ptr<Pseudo_instruction> pc; 2952 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, 2953 Format::PSEUDO, parallelcopy.size(), 2954 parallelcopy.size())); 2955 bool linear_vgpr = false; 2956 bool sgpr_operands_alias_defs = false; 2957 uint64_t sgpr_operands[4] = {0, 0, 0, 0}; 2958 for (unsigned i = 0; i < parallelcopy.size(); i++) { 2959 linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr(); 2960 2961 if (temp_in_scc && parallelcopy[i].first.isTemp() && 2962 parallelcopy[i].first.getTemp().type() == RegType::sgpr) { 2963 if (!sgpr_operands_alias_defs) { 2964 unsigned reg = 
parallelcopy[i].first.physReg().reg(); 2965 unsigned size = parallelcopy[i].first.getTemp().size(); 2966 sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size); 2967 2968 reg = parallelcopy[i].second.physReg().reg(); 2969 size = parallelcopy[i].second.getTemp().size(); 2970 if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) 2971 sgpr_operands_alias_defs = true; 2972 } 2973 } 2974 2975 pc->operands[i] = parallelcopy[i].first; 2976 pc->definitions[i] = parallelcopy[i].second; 2977 assert(pc->operands[i].size() == pc->definitions[i].size()); 2978 2979 /* it might happen that the operand is already renamed. we have to restore the 2980 * original name. */ 2981 std::unordered_map<unsigned, Temp>::iterator it = 2982 ctx.orig_names.find(pc->operands[i].tempId()); 2983 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp(); 2984 ctx.orig_names[pc->definitions[i].tempId()] = orig; 2985 ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp(); 2986 } 2987 2988 if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) { 2989 /* disable definitions and re-enable operands */ 2990 RegisterFile tmp_file(register_file); 2991 for (const Definition& def : instr->definitions) { 2992 if (def.isTemp() && !def.isKill()) 2993 tmp_file.clear(def); 2994 } 2995 for (const Operand& op : instr->operands) { 2996 if (op.isTemp() && op.isFirstKill()) 2997 tmp_file.block(op.physReg(), op.regClass()); 2998 } 2999 3000 handle_pseudo(ctx, tmp_file, pc.get()); 3001 } else { 3002 pc->tmp_in_scc = false; 3003 } 3004 3005 instructions.emplace_back(std::move(pc)); 3006 } 3007 3008 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */ 3009 bool instr_needs_vop3 = 3010 !instr->isVOP3() && 3011 ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || 3012 (instr->opcode == aco_opcode::v_cndmask_b32 && 3013 !(instr->operands[2].physReg() == vcc)) || 3014 ((instr->opcode == 
aco_opcode::v_add_co_u32 || 3015 instr->opcode == aco_opcode::v_addc_co_u32 || 3016 instr->opcode == aco_opcode::v_sub_co_u32 || 3017 instr->opcode == aco_opcode::v_subb_co_u32 || 3018 instr->opcode == aco_opcode::v_subrev_co_u32 || 3019 instr->opcode == aco_opcode::v_subbrev_co_u32) && 3020 !(instr->definitions[1].physReg() == vcc)) || 3021 ((instr->opcode == aco_opcode::v_addc_co_u32 || 3022 instr->opcode == aco_opcode::v_subb_co_u32 || 3023 instr->opcode == aco_opcode::v_subbrev_co_u32) && 3024 !(instr->operands[2].physReg() == vcc))); 3025 if (instr_needs_vop3) { 3026 3027 /* if the first operand is a literal, we have to move it to a reg */ 3028 if (instr->operands.size() && instr->operands[0].isLiteral() && 3029 program->gfx_level < GFX10) { 3030 bool can_sgpr = true; 3031 /* check, if we have to move to vgpr */ 3032 for (const Operand& op : instr->operands) { 3033 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) { 3034 can_sgpr = false; 3035 break; 3036 } 3037 } 3038 /* disable definitions and re-enable operands */ 3039 RegisterFile tmp_file(register_file); 3040 for (const Definition& def : instr->definitions) 3041 tmp_file.clear(def); 3042 for (const Operand& op : instr->operands) { 3043 if (op.isTemp() && op.isFirstKill()) 3044 tmp_file.block(op.physReg(), op.regClass()); 3045 } 3046 Temp tmp = program->allocateTmp(can_sgpr ? 
s1 : v1); 3047 ctx.assignments.emplace_back(); 3048 PhysReg reg = get_reg(ctx, tmp_file, tmp, parallelcopy, instr); 3049 update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops); 3050 3051 aco_ptr<Instruction> mov; 3052 if (can_sgpr) 3053 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, 3054 Format::SOP1, 1, 1)); 3055 else 3056 mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, 3057 Format::VOP1, 1, 1)); 3058 mov->operands[0] = instr->operands[0]; 3059 mov->definitions[0] = Definition(tmp); 3060 mov->definitions[0].setFixed(reg); 3061 3062 instr->operands[0] = Operand(tmp); 3063 instr->operands[0].setFixed(reg); 3064 instr->operands[0].setFirstKill(true); 3065 3066 instructions.emplace_back(std::move(mov)); 3067 } 3068 3069 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */ 3070 aco_ptr<Instruction> tmp = std::move(instr); 3071 Format format = asVOP3(tmp->format); 3072 instr.reset(create_instruction<VOP3_instruction>( 3073 tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); 3074 std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin()); 3075 std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin()); 3076 } 3077 3078 instructions.emplace_back(std::move(*instr_it)); 3079 3080 } /* end for Instr */ 3081 3082 block.instructions = std::move(instructions); 3083 } /* end for BB */ 3084 3085 /* num_gpr = rnd_up(max_used_gpr + 1) */ 3086 program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1); 3087 program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1); 3088 3089 program->progress = CompilationProgress::after_ra; 3090} 3091 3092} // namespace aco 3093