1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "aco_ir.h"
26 
27 #include <algorithm>
28 #include <array>
29 #include <bitset>
30 #include <map>
31 #include <set>
32 #include <unordered_map>
33 #include <vector>
34 
35 namespace aco {
36 namespace {
37 
struct ra_ctx;

/* Forward declarations for the subdword helpers defined later in this file. */
unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                                     unsigned idx, RegClass rc);
void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                          RegClass rc);
std::pair<unsigned, unsigned>
get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc);
void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg);
47 
/* Per-temporary allocation state (register, class and flags); stored in
 * ra_ctx::assignments indexed by temp id. */
struct assignment {
   PhysReg reg;
   RegClass rc;
   union {
      struct {
         bool assigned : 1;
         bool vcc : 1;
      };
      /* Default member initializer clears the whole flag byte (all flags 0). */
      uint8_t _ = 0;
   };
   uint32_t affinity = 0;
   assignment() = default;
   /* NOTE(review): initializing the 1-bit 'assigned' with -1 converts to true;
    * this ctor bypasses the union's `_ = 0` default, so 'vcc' is left
    * indeterminate here — presumably never read for temps created this way,
    * but confirm. */
   assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {}
   /* Record the register/class chosen for a definition and mark it assigned. */
   void set(const Definition& def)
   {
      assigned = true;
      reg = def.physReg();
      rc = def.regClass();
   }
};
68 
/* Global state carried through the register-allocation pass over one Program. */
struct ra_ctx {

   Program* program;
   Block* block = NULL;
   /* Allocation state per temp; sized from program->peekAllocationId() and
    * indexed by temp id. */
   std::vector<assignment> assignments;
   /* One rename map per block: original temp id -> renamed Temp. */
   std::vector<std::unordered_map<unsigned, Temp>> renames;
   std::vector<uint32_t> loop_header;
   /* Maps a renamed temp id back to its pre-RA name (used for debug printing). */
   std::unordered_map<unsigned, Temp> orig_names;
   /* NOTE(review): presumably temp id -> producing/consuming vector
    * instruction for affinity handling — confirm against users of these maps. */
   std::unordered_map<unsigned, Instruction*> vectors;
   std::unordered_map<unsigned, Instruction*> split_vectors;
   /* Placeholder instructions used where a real instruction is not available. */
   aco_ptr<Instruction> pseudo_dummy;
   aco_ptr<Instruction> phi_dummy;
   uint16_t max_used_sgpr = 0;
   uint16_t max_used_vgpr = 0;
   /* Addressable register limits derived from the program's minimum wave count. */
   uint16_t sgpr_limit;
   uint16_t vgpr_limit;
   std::bitset<512> war_hint;
   std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */

   ra_test_policy policy;

   ra_ctx(Program* program_, ra_test_policy policy_)
       : program(program_), assignments(program->peekAllocationId()),
         renames(program->blocks.size()), policy(policy_)
   {
      pseudo_dummy.reset(
         create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
      phi_dummy.reset(
         create_instruction<Instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
      sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
      vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
   }
};
102 
103 /* Iterator type for making PhysRegInterval compatible with range-based for */
104 struct PhysRegIterator {
105    using difference_type = int;
106    using value_type = unsigned;
107    using reference = const unsigned&;
108    using pointer = const unsigned*;
109    using iterator_category = std::bidirectional_iterator_tag;
110 
111    PhysReg reg;
112 
operator *aco::__anon6956::PhysRegIterator113    PhysReg operator*() const { return reg; }
114 
operator ++aco::__anon6956::PhysRegIterator115    PhysRegIterator& operator++()
116    {
117       reg.reg_b += 4;
118       return *this;
119    }
120 
operator --aco::__anon6956::PhysRegIterator121    PhysRegIterator& operator--()
122    {
123       reg.reg_b -= 4;
124       return *this;
125    }
126 
operator ==aco::__anon6956::PhysRegIterator127    bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }
128 
operator !=aco::__anon6956::PhysRegIterator129    bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }
130 
operator <aco::__anon6956::PhysRegIterator131    bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
132 };
133 
134 /* Half-open register interval used in "sliding window"-style for-loops */
135 struct PhysRegInterval {
136    PhysReg lo_;
137    unsigned size;
138 
139    /* Inclusive lower bound */
loaco::__anon6956::PhysRegInterval140    PhysReg lo() const { return lo_; }
141 
142    /* Exclusive upper bound */
hiaco::__anon6956::PhysRegInterval143    PhysReg hi() const { return PhysReg{lo() + size}; }
144 
operator +=aco::__anon6956::PhysRegInterval145    PhysRegInterval& operator+=(uint32_t stride)
146    {
147       lo_ = PhysReg{lo_.reg() + stride};
148       return *this;
149    }
150 
operator !=aco::__anon6956::PhysRegInterval151    bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }
152 
153    /* Construct a half-open interval, excluding the end register */
from_untilaco::__anon6956::PhysRegInterval154    static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }
155 
containsaco::__anon6956::PhysRegInterval156    bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }
157 
containsaco::__anon6956::PhysRegInterval158    bool contains(const PhysRegInterval& needle) const
159    {
160       return needle.lo() >= lo() && needle.hi() <= hi();
161    }
162 
beginaco::__anon6956::PhysRegInterval163    PhysRegIterator begin() const { return {lo_}; }
164 
endaco::__anon6956::PhysRegInterval165    PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
166 };
167 
168 bool
intersects(const PhysRegInterval& a, const PhysRegInterval& b)169 intersects(const PhysRegInterval& a, const PhysRegInterval& b)
170 {
171    return a.hi() > b.lo() && b.hi() > a.lo();
172 }
173 
174 /* Gets the stride for full (non-subdword) registers */
175 uint32_t
get_stride(RegClass rc)176 get_stride(RegClass rc)
177 {
178    if (rc.type() == RegType::vgpr) {
179       return 1;
180    } else {
181       uint32_t size = rc.size();
182       if (size == 2) {
183          return 2;
184       } else if (size >= 4) {
185          return 4;
186       } else {
187          return 1;
188       }
189    }
190 }
191 
192 PhysRegInterval
get_reg_bounds(Program* program, RegType type)193 get_reg_bounds(Program* program, RegType type)
194 {
195    if (type == RegType::vgpr) {
196       return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr};
197    } else {
198       return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr};
199    }
200 }
201 
/* Size, stride and register-file bounds used when allocating a register for a
 * definition (or, when operand >= 0, for that operand of 'instr'). */
struct DefInfo {
   PhysRegInterval bounds;
   uint8_t size;
   uint8_t stride;
   RegClass rc;

   /* operand: index of the operand to size for, or -1 for the definition. */
   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)
   {
      size = rc.size();
      stride = get_stride(rc);

      bounds = get_reg_bounds(ctx.program, rc.type());

      if (rc.is_subdword() && operand >= 0) {
         /* stride in bytes */
         stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc);
      } else if (rc.is_subdword()) {
         /* Subdword definition: the instruction may write more bytes than the
          * definition's class holds; grow rc/size to cover them. */
         std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);
         stride = info.first;
         if (info.second > rc.bytes()) {
            rc = RegClass::get(rc.type(), info.second);
            size = rc.size();
            /* we might still be able to put the definition in the high half,
             * but that's only useful for affinities and this information isn't
             * used for them */
            stride = align(stride, info.second);
            if (!rc.is_subdword())
               stride = DIV_ROUND_UP(stride, 4);
         }
         assert(stride > 0);
      } else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) {
         /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug
          *
          * The register use is not calculated correctly, and the hardware assumes a
          * full dword per component. Don't use the last registers of the register file.
          * Otherwise, the instruction will be skipped.
          *
          * https://reviews.llvm.org/D81172
          */
         bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF;
         assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");

         if (imageGather4D16Bug)
            bounds.size -= rc.bytes() / 4;
      }
   }
};
249 
/* Tracks which physical registers currently hold which temporaries.
 *
 * Encoding of regs[r]:
 *   0          - free
 *   0xFFFFFFFF - blocked (unusable)
 *   0xF0000000 - sentinel: register is subdivided; per-byte owners live in
 *                subdword_regs[r] (same encoding per byte)
 *   otherwise  - id of the temp occupying the whole dword
 */
class RegisterFile {
public:
   RegisterFile() { regs.fill(0); }

   std::array<uint32_t, 512> regs;
   /* Per-byte owner ids for registers whose regs[] entry is 0xF0000000. */
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator[](PhysReg index) const { return regs[index]; }

   uint32_t& operator[](PhysReg index) { return regs[index]; }

   /* Number of completely free dword registers in the interval. */
   unsigned count_zero(PhysRegInterval reg_interval)
   {
      unsigned res = 0;
      for (PhysReg reg : reg_interval)
         res += !regs[reg];
      return res;
   }

   /* Returns true if any of the bytes in the given range are allocated or blocked */
   bool test(PhysReg start, unsigned num_bytes)
   {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         assert(i <= 511);
         /* Any non-sentinel, non-zero value means the dword is taken
          * (0xF0000000 has the low 28 bits clear, so it passes this check). */
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            assert(subdword_regs.find(i) != subdword_regs.end());
            /* Check only the bytes of this dword that fall inside the range. */
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (subdword_regs[i][j])
                  return true;
            }
         }
      }
      return false;
   }

   /* Mark the registers covered by rc at start as unusable. */
   void block(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   /* Whether the dword at start (or any of its tracked bytes) is blocked. */
   bool is_blocked(PhysReg start)
   {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         for (unsigned i = start.byte(); i < 4; i++)
            if (subdword_regs[start][i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   bool is_empty_or_blocked(PhysReg start)
   {
      /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
       * incremented value to 1 */
      if (regs[start] == 0xF0000000) {
         return subdword_regs[start][start.byte()] + 1 <= 1;
      }
      return regs[start] + 1 <= 1;
   }

   /* Free the registers covered by rc at start. */
   void clear(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   /* Record the operand's temp id over the registers it occupies. */
   void fill(Operand op)
   {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) { clear(op.physReg(), op.regClass()); }

   /* Record the definition's temp id over the registers it occupies. */
   void fill(Definition def)
   {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) { clear(def.physReg(), def.regClass()); }

   /* Id of the temp occupying reg (resolves the subdword sentinel). */
   unsigned get_id(PhysReg reg)
   {
      return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val)
   {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   /* Write val into num_bytes byte slots starting at start; marks the covered
    * dwords with the 0xF0000000 sentinel and collapses a dword back to free
    * when all four of its bytes become 0. */
   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)
   {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub =
            subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};
374 
375 std::vector<unsigned> find_vars(ra_ctx& ctx, RegisterFile& reg_file,
376                                 const PhysRegInterval reg_interval);
377 
378 /* helper function for debugging */
379 UNUSED void
print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)380 print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
381 {
382    if (reg_file[reg] == 0xFFFFFFFF) {
383       printf("☐");
384    } else if (reg_file[reg]) {
385       const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
386       if (show_subdword_alloc) {
387          const char* block_chars[] = {
388             // clang-format off
389             "?", "▘", "▝", "▀",
390             "▖", "▌", "▞", "▛",
391             "▗", "▚", "▐", "▜",
392             "▄", "▙", "▟", "▉"
393             // clang-format on
394          };
395          unsigned index = 0;
396          for (int i = 0; i < 4; ++i) {
397             if (reg_file.subdword_regs.at(reg)[i]) {
398                index |= 1 << i;
399             }
400          }
401          printf("%s", block_chars[index]);
402       } else {
403          /* Indicate filled register slot */
404          if (!has_adjacent_variable) {
405             printf("█");
406          } else {
407             /* Use a slightly shorter box to leave a small gap between adjacent variables */
408             printf("▉");
409          }
410       }
411    } else {
412       printf("·");
413    }
414 }
415 
/* helper function for debugging: dumps the whole SGPR or VGPR file plus the
 * variables currently assigned inside it */
UNUSED void
print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
{
   PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr);
   char reg_char = vgprs ? 'v' : 's';
   const int max_regs_per_line = 64;

   /* print markers */
   printf("       ");
   for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {
      printf("%-3.2u ", i);
   }
   printf("\n");

   /* print usage, max_regs_per_line registers per row */
   auto line_begin_it = regs.begin();
   while (line_begin_it != regs.end()) {
      const int regs_in_line =
         std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));

      if (line_begin_it == regs.begin()) {
         printf("%cgprs: ", reg_char);
      } else {
         /* continuation rows are prefixed with their register offset */
         printf("  %+4d ", std::distance(regs.begin(), line_begin_it));
      }
      const auto line_end_it = std::next(line_begin_it, regs_in_line);

      for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {
         /* adjacent slots holding different, non-free ids get a visual gap */
         bool has_adjacent_variable =
            (std::next(reg_it) != line_end_it &&
             reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);
         print_reg(reg_file, *reg_it, has_adjacent_variable);
      }

      line_begin_it = line_end_it;
      printf("\n");
   }

   const unsigned free_regs =
      std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });
   printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);

   /* print assignments ordered by registers */
   std::map<PhysReg, std::pair<unsigned, unsigned>>
      regs_to_vars; /* maps to byte size and temp id */
   for (unsigned id : find_vars(ctx, reg_file, regs)) {
      const assignment& var = ctx.assignments[id];
      PhysReg reg = var.reg;
      ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id));
      assert(inserted.second);
   }

   for (const auto& reg_and_var : regs_to_vars) {
      const auto& first_reg = reg_and_var.first;
      const auto& size_id = reg_and_var.second;

      printf("%%%u ", size_id.second);
      /* show the pre-RA name if the temp was renamed */
      if (ctx.orig_names.count(size_id.second) &&
          ctx.orig_names[size_id.second].id() != size_id.second) {
         printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
      }
      printf("= %c[%d", reg_char, first_reg.reg() - regs.lo());
      PhysReg last_reg = first_reg.advance(size_id.first - 1);
      if (first_reg.reg() != last_reg.reg()) {
         assert(first_reg.byte() == 0 && last_reg.byte() == 3);
         printf("-%d", last_reg.reg() - regs.lo());
      }
      printf("]");
      /* subdword variables additionally get a bit-range suffix */
      if (first_reg.byte() != 0 || last_reg.byte() != 3) {
         printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);
      }
      printf("\n");
   }
}
491 
492 unsigned
get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, unsigned idx, RegClass rc)493 get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
494                             unsigned idx, RegClass rc)
495 {
496    if (instr->isPseudo()) {
497       /* v_readfirstlane_b32 cannot use SDWA */
498       if (instr->opcode == aco_opcode::p_as_uniform)
499          return 4;
500       else if (gfx_level >= GFX8)
501          return rc.bytes() % 2 == 0 ? 2 : 1;
502       else
503          return 4;
504    }
505 
506    assert(rc.bytes() <= 2);
507    if (instr->isVALU()) {
508       if (can_use_SDWA(gfx_level, instr, false))
509          return rc.bytes();
510       if (can_use_opsel(gfx_level, instr->opcode, idx))
511          return 2;
512       if (instr->format == Format::VOP3P)
513          return 2;
514    }
515 
516    switch (instr->opcode) {
517    case aco_opcode::v_cvt_f32_ubyte0: return 1;
518    case aco_opcode::ds_write_b8:
519    case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
520    case aco_opcode::buffer_store_byte:
521    case aco_opcode::buffer_store_short:
522    case aco_opcode::buffer_store_format_d16_x:
523    case aco_opcode::flat_store_byte:
524    case aco_opcode::flat_store_short:
525    case aco_opcode::scratch_store_byte:
526    case aco_opcode::scratch_store_short:
527    case aco_opcode::global_store_byte:
528    case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 2 : 4;
529    default: return 4;
530    }
531 }
532 
533 void
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, RegClass rc)534 add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
535                      RegClass rc)
536 {
537    amd_gfx_level gfx_level = ctx.program->gfx_level;
538    if (instr->isPseudo() || byte == 0)
539       return;
540 
541    assert(rc.bytes() <= 2);
542    if (instr->isVALU()) {
543       /* check if we can use opsel */
544       if (instr->format == Format::VOP3) {
545          assert(byte == 2);
546          instr->vop3().opsel |= 1 << idx;
547          return;
548       }
549       if (instr->isVOP3P()) {
550          assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx)));
551          instr->vop3p().opsel_lo |= 1 << idx;
552          instr->vop3p().opsel_hi |= 1 << idx;
553          return;
554       }
555       if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
556          switch (byte) {
557          case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
558          case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
559          case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
560          case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
561          }
562          return;
563       }
564 
565       /* use SDWA */
566       assert(can_use_SDWA(gfx_level, instr, false));
567       convert_to_SDWA(gfx_level, instr);
568       return;
569    }
570 
571    assert(byte == 2);
572    if (instr->opcode == aco_opcode::ds_write_b8)
573       instr->opcode = aco_opcode::ds_write_b8_d16_hi;
574    else if (instr->opcode == aco_opcode::ds_write_b16)
575       instr->opcode = aco_opcode::ds_write_b16_d16_hi;
576    else if (instr->opcode == aco_opcode::buffer_store_byte)
577       instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
578    else if (instr->opcode == aco_opcode::buffer_store_short)
579       instr->opcode = aco_opcode::buffer_store_short_d16_hi;
580    else if (instr->opcode == aco_opcode::buffer_store_format_d16_x)
581       instr->opcode = aco_opcode::buffer_store_format_d16_hi_x;
582    else if (instr->opcode == aco_opcode::flat_store_byte)
583       instr->opcode = aco_opcode::flat_store_byte_d16_hi;
584    else if (instr->opcode == aco_opcode::flat_store_short)
585       instr->opcode = aco_opcode::flat_store_short_d16_hi;
586    else if (instr->opcode == aco_opcode::scratch_store_byte)
587       instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
588    else if (instr->opcode == aco_opcode::scratch_store_short)
589       instr->opcode = aco_opcode::scratch_store_short_d16_hi;
590    else if (instr->opcode == aco_opcode::global_store_byte)
591       instr->opcode = aco_opcode::global_store_byte_d16_hi;
592    else if (instr->opcode == aco_opcode::global_store_short)
593       instr->opcode = aco_opcode::global_store_short_d16_hi;
594    else
595       unreachable("Something went wrong: Impossible register assignment.");
596    return;
597 }
598 
/* minimum_stride, bytes_written */
std::pair<unsigned, unsigned>
get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc)
{
   amd_gfx_level gfx_level = program->gfx_level;

   if (instr->isPseudo()) {
      /* GFX8+ can place pseudo results at any (even) byte; earlier chips are
       * dword-only. */
      if (gfx_level >= GFX8)
         return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
      else
         return std::make_pair(4, rc.size() * 4u);
   }

   if (instr->isVALU() || instr->isVINTRP()) {
      assert(rc.bytes() <= 2);

      if (can_use_SDWA(gfx_level, instr, false))
         return std::make_pair(rc.bytes(), rc.bytes());

      unsigned bytes_written = 4u;
      if (instr_is_16bit(gfx_level, instr->opcode))
         bytes_written = 2u;

      /* opsel (or the mixlo->mixhi rewrite) allows writing the high half */
      unsigned stride = 4u;
      if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
          can_use_opsel(gfx_level, instr->opcode, -1))
         stride = 2u;

      return std::make_pair(stride, bytes_written);
   }

   switch (instr->opcode) {
   /* D16 loads with _hi version */
   case aco_opcode::ds_read_u8_d16:
   case aco_opcode::ds_read_i8_d16:
   case aco_opcode::ds_read_u16_d16:
   case aco_opcode::flat_load_ubyte_d16:
   case aco_opcode::flat_load_sbyte_d16:
   case aco_opcode::flat_load_short_d16:
   case aco_opcode::global_load_ubyte_d16:
   case aco_opcode::global_load_sbyte_d16:
   case aco_opcode::global_load_short_d16:
   case aco_opcode::scratch_load_ubyte_d16:
   case aco_opcode::scratch_load_sbyte_d16:
   case aco_opcode::scratch_load_short_d16:
   case aco_opcode::buffer_load_ubyte_d16:
   case aco_opcode::buffer_load_sbyte_d16:
   case aco_opcode::buffer_load_short_d16:
   case aco_opcode::buffer_load_format_d16_x: {
      assert(gfx_level >= GFX9);
      /* with SRAM ECC the load clobbers the full dword */
      if (!program->dev.sram_ecc_enabled)
         return std::make_pair(2u, 2u);
      else
         return std::make_pair(2u, 4u);
   }
   /* 3-component D16 loads */
   case aco_opcode::buffer_load_format_d16_xyz:
   case aco_opcode::tbuffer_load_format_d16_xyz: {
      assert(gfx_level >= GFX9);
      if (!program->dev.sram_ecc_enabled)
         return std::make_pair(4u, 6u);
      /* with SRAM ECC, fall through to the dword-aligned default below */
      break;
   }

   default: break;
   }

   if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) {
      assert(gfx_level >= GFX9);
      return std::make_pair(4u, rc.bytes());
   }

   /* default: dword-aligned, whole dwords written */
   return std::make_pair(4, rc.size() * 4u);
}
673 
674 void
add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg)675 add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg)
676 {
677    if (instr->isPseudo())
678       return;
679 
680    if (instr->isVALU()) {
681       amd_gfx_level gfx_level = program->gfx_level;
682       assert(instr->definitions[0].bytes() <= 2);
683 
684       if (reg.byte() == 0 && instr_is_16bit(gfx_level, instr->opcode))
685          return;
686 
687       /* check if we can use opsel */
688       if (instr->format == Format::VOP3) {
689          assert(reg.byte() == 2);
690          assert(can_use_opsel(gfx_level, instr->opcode, -1));
691          instr->vop3().opsel |= (1 << 3); /* dst in high half */
692          return;
693       }
694 
695       if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
696          instr->opcode = aco_opcode::v_fma_mixhi_f16;
697          return;
698       }
699 
700       /* use SDWA */
701       assert(can_use_SDWA(gfx_level, instr, false));
702       convert_to_SDWA(gfx_level, instr);
703       return;
704    }
705 
706    if (reg.byte() == 0)
707       return;
708    else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
709       instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
710    else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
711       instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
712    else if (instr->opcode == aco_opcode::buffer_load_short_d16)
713       instr->opcode = aco_opcode::buffer_load_short_d16_hi;
714    else if (instr->opcode == aco_opcode::buffer_load_format_d16_x)
715       instr->opcode = aco_opcode::buffer_load_format_d16_hi_x;
716    else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
717       instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
718    else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
719       instr->opcode = aco_opcode::flat_load_sbyte_d16_hi;
720    else if (instr->opcode == aco_opcode::flat_load_short_d16)
721       instr->opcode = aco_opcode::flat_load_short_d16_hi;
722    else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
723       instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
724    else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16)
725       instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi;
726    else if (instr->opcode == aco_opcode::scratch_load_short_d16)
727       instr->opcode = aco_opcode::scratch_load_short_d16_hi;
728    else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
729       instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
730    else if (instr->opcode == aco_opcode::global_load_sbyte_d16)
731       instr->opcode = aco_opcode::global_load_sbyte_d16_hi;
732    else if (instr->opcode == aco_opcode::global_load_short_d16)
733       instr->opcode = aco_opcode::global_load_short_d16_hi;
734    else if (instr->opcode == aco_opcode::ds_read_u8_d16)
735       instr->opcode = aco_opcode::ds_read_u8_d16_hi;
736    else if (instr->opcode == aco_opcode::ds_read_i8_d16)
737       instr->opcode = aco_opcode::ds_read_i8_d16_hi;
738    else if (instr->opcode == aco_opcode::ds_read_u16_d16)
739       instr->opcode = aco_opcode::ds_read_u16_d16_hi;
740    else
741       unreachable("Something went wrong: Impossible register assignment.");
742 }
743 
744 void
adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)745 adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
746 {
747    uint16_t max_addressible_sgpr = ctx.sgpr_limit;
748    unsigned size = rc.size();
749    if (rc.type() == RegType::vgpr) {
750       assert(reg >= 256);
751       uint16_t hi = reg - 256 + size - 1;
752       ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
753    } else if (reg + rc.size() <= max_addressible_sgpr) {
754       uint16_t hi = reg + size - 1;
755       ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
756    }
757 }
758 
/* Bitfield flags controlling update_renames() behavior. */
enum UpdateRenames {
   /* also rename operands that are not killed by the instruction */
   rename_not_killed_ops = 0x1,
   /* re-fill killed operands in the register file after copying */
   fill_killed_ops = 0x2,
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);
764 
/* Post-processes freshly created parallelcopies for \p instr:
 * - copies that moved an instruction definition or another parallelcopy
 *   destination are folded into that definition and removed,
 * - remaining copies get a new temporary id allocated for their destination,
 * - instruction operands referring to a copied temporary are renamed to the
 *   copy's destination,
 * - the register file and ctx.assignments are updated accordingly.
 * Copies whose definition already carries a temp id were created elsewhere
 * and are left untouched.
 */
void
update_renames(ra_ctx& ctx, RegisterFile& reg_file,
               std::vector<std::pair<Operand, Definition>>& parallelcopies,
               aco_ptr<Instruction>& instr, UpdateRenames flags)
{
   /* clear operands */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;
      reg_file.clear(copy.first);
   }

   /* allocate id's and rename operands: this is done transparently here */
   auto it = parallelcopies.begin();
   while (it != parallelcopies.end()) {
      if (it->second.isTemp()) {
         ++it;
         continue;
      }

      /* check if we moved a definition: change the register and remove copy */
      bool is_def = false;
      for (Definition& def : instr->definitions) {
         if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
            // FIXME: ensure that the definition can use this reg
            def.setFixed(it->second.physReg());
            reg_file.fill(def);
            ctx.assignments[def.tempId()].reg = def.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            break;
         }
      }
      if (is_def)
         continue;

      /* check if we moved another parallelcopy definition */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (it->first.getTemp() == other.second.getTemp()) {
            /* redirect the earlier copy to this copy's destination */
            other.second.setFixed(it->second.physReg());
            ctx.assignments[other.second.tempId()].reg = other.second.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            /* check if we moved an operand, again */
            bool fill = true;
            for (Operand& op : instr->operands) {
               if (op.isTemp() && op.tempId() == other.second.tempId()) {
                  // FIXME: ensure that the operand can use this reg
                  op.setFixed(other.second.physReg());
                  fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
               }
            }
            if (fill)
               reg_file.fill(other.second);
            break;
         }
      }
      if (is_def)
         continue;

      /* a genuine operand copy: allocate a fresh temp id for the destination */
      std::pair<Operand, Definition>& copy = *it;
      copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());

      /* check if we moved an operand */
      bool first = true;
      bool fill = true;
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand& op = instr->operands[i];
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* renaming can be skipped for killed operands as long as the
             * source register range doesn't overlap any copy destination */
            bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef();
            for (std::pair<Operand, Definition>& pc : parallelcopies) {
               PhysReg def_reg = pc.second.physReg();
               omit_renaming &= def_reg > copy.first.physReg()
                                   ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
                                   : (def_reg + pc.second.size() <= copy.first.physReg().reg());
            }
            if (omit_renaming) {
               if (first)
                  op.setFirstKill(true);
               else
                  op.setKill(true);
               first = false;
               continue;
            }
            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());

            fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
         }
      }

      if (fill)
         reg_file.fill(copy.second);

      ++it;
   }
}
869 
/* Searches info.bounds for a free register area of the requested size and
 * stride without displacing any already-allocated variable.
 * Returns {register, true} on success, {undefined, false} otherwise.
 */
std::pair<PhysReg, bool>
get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
{
   const PhysRegInterval& bounds = info.bounds;
   uint32_t size = info.size;
   /* sub-dword strides are given in bytes; convert to whole dwords here */
   uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
   RegClass rc = info.rc;

   /* Recursively try larger power-of-two strides (16 down to just above the
    * required one) first, so more strongly aligned positions are preferred
    * when available. */
   DefInfo new_info = info;
   new_info.rc = RegClass(rc.type(), size);
   for (unsigned new_stride = 16; new_stride > stride; new_stride /= 2) {
      if (size % new_stride)
         continue;
      new_info.stride = new_stride;
      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, new_info);
      if (res.second)
         return res;
   }

   /* a register is usable if unoccupied and without a write-after-read hint */
   auto is_free = [&](PhysReg reg_index)
   { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };

   if (stride == 1) {
      /* best fit algorithm: find the smallest gap to fit in the variable */
      PhysRegInterval best_gap{PhysReg{0}, UINT_MAX};
      const unsigned max_gpr =
         (rc.type() == RegType::vgpr) ? (256 + ctx.max_used_vgpr) : ctx.max_used_sgpr;

      PhysRegIterator reg_it = bounds.begin();
      /* registers above the highest one used so far are all free, so the
       * scan can stop just past max_gpr */
      const PhysRegIterator end_it =
         std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it));
      while (reg_it != bounds.end()) {
         /* Find the next chunk of available register slots */
         reg_it = std::find_if(reg_it, end_it, is_free);
         auto next_nonfree_it = std::find_if_not(reg_it, end_it, is_free);
         if (reg_it == bounds.end()) {
            break;
         }

         if (next_nonfree_it == end_it) {
            /* All registers past max_used_gpr are free */
            next_nonfree_it = bounds.end();
         }

         PhysRegInterval gap = PhysRegInterval::from_until(*reg_it, *next_nonfree_it);

         /* early return on exact matches */
         if (size == gap.size) {
            adjust_max_used_regs(ctx, rc, gap.lo());
            return {gap.lo(), true};
         }

         /* check if it fits and the gap size is smaller */
         if (size < gap.size && gap.size < best_gap.size) {
            best_gap = gap;
         }

         /* Move past the processed chunk */
         reg_it = next_nonfree_it;
      }

      if (best_gap.size == UINT_MAX)
         return {{}, false};

      /* find best position within gap by leaving a good stride for other variables*/
      unsigned buffer = best_gap.size - size;
      if (buffer > 1) {
         /* shift the allocation to the end of the gap when that leaves a
          * better-aligned (8/4/2) remainder at the start */
         if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) ||
             ((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) ||
             ((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0))
            best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer};
      }

      adjust_max_used_regs(ctx, rc, best_gap.lo());
      return {best_gap.lo(), true};
   }

   /* stride > 1: slide a window over all stride-aligned positions */
   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      if (reg_file[reg_win.lo()] != 0) {
         continue;
      }

      bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free);
      if (is_valid) {
         adjust_max_used_regs(ctx, rc, reg_win.lo());
         return {reg_win.lo(), true};
      }
   }

   /* do this late because using the upper bytes of a register can require
    * larger instruction encodings or copies
    * TODO: don't do this in situations where it doesn't benefit */
   if (rc.is_subdword()) {
      /* scan registers that already contain sub-dword allocations
       * (marked 0xF0000000 in the main file) for free byte ranges */
      for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) {
         assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
         if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
            continue;

         for (unsigned i = 0; i < 4; i += info.stride) {
            /* check if there's a block of free bytes large enough to hold the register */
            bool reg_found =
               std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())],
                           [](unsigned v) { return v == 0; });

            /* check if also the neighboring reg is free if needed */
            if (reg_found && i + rc.bytes() > 4)
               reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);

            if (reg_found) {
               PhysReg res{entry.first};
               res.reg_b += i;
               adjust_max_used_regs(ctx, rc, entry.first);
               return {res, true};
            }
         }
      }
   }

   return {{}, false};
}
991 
992 /* collect variables from a register area */
993 std::vector<unsigned>
find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)994 find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
995 {
996    std::vector<unsigned> vars;
997    for (PhysReg j : reg_interval) {
998       if (reg_file.is_blocked(j))
999          continue;
1000       if (reg_file[j] == 0xF0000000) {
1001          for (unsigned k = 0; k < 4; k++) {
1002             unsigned id = reg_file.subdword_regs[j][k];
1003             if (id && (vars.empty() || id != vars.back()))
1004                vars.emplace_back(id);
1005          }
1006       } else {
1007          unsigned id = reg_file[j];
1008          if (id && (vars.empty() || id != vars.back()))
1009             vars.emplace_back(id);
1010       }
1011    }
1012    return vars;
1013 }
1014 
1015 /* collect variables from a register area and clear reg_file
1016  * variables are sorted in decreasing size and
1017  * increasing assigned register
1018  */
1019 std::vector<unsigned>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)1020 collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
1021 {
1022    std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
1023    std::sort(ids.begin(), ids.end(),
1024              [&](unsigned a, unsigned b)
1025              {
1026                 assignment& var_a = ctx.assignments[a];
1027                 assignment& var_b = ctx.assignments[b];
1028                 return var_a.rc.bytes() > var_b.rc.bytes() ||
1029                        (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg);
1030              });
1031 
1032    for (unsigned id : ids) {
1033       assignment& var = ctx.assignments[id];
1034       reg_file.clear(var.reg, var.rc);
1035    }
1036    return ids;
1037 }
1038 
/* For a p_create_vector instruction: tries to pick a copy destination for
 * variable \p id that matches its position inside the vector definition
 * window \p def_reg, or swaps it with a killed operand already in place.
 * The bool result indicates whether a usable register was found.
 */
std::pair<PhysReg, bool>
get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
                               std::vector<std::pair<Operand, Definition>>& parallelcopies,
                               aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
                               DefInfo info, unsigned id)
{
   PhysReg reg = def_reg.lo();
   /* dead operand: return position in vector */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
          instr->operands[i].isKillBeforeDef()) {
         assert(!reg_file.test(reg, instr->operands[i].bytes()));
         /* a dword-class copy is only usable at a dword-aligned position */
         return {reg, info.rc.is_subdword() || reg.byte() == 0};
      }
      reg.reg_b += instr->operands[i].bytes();
   }

   /* the swap path below is only attempted on GFX9 and newer */
   if (ctx.program->gfx_level <= GFX8)
      return {PhysReg(), false};

   /* check if the previous position was in vector */
   assignment& var = ctx.assignments[id];
   if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
      reg = def_reg.lo();
      /* try to use the previous register of the operand */
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         /* walk operand slots until we reach the one at var's old register */
         if (reg != var.reg) {
            reg.reg_b += instr->operands[i].bytes();
            continue;
         }

         /* check if we can swap positions */
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
             instr->operands[i].regClass() == info.rc) {
            assignment& op = ctx.assignments[instr->operands[i].tempId()];
            /* if everything matches, create parallelcopy for the killed operand */
            if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) &&
                reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
               Definition pc_def = Definition(reg, info.rc);
               parallelcopies.emplace_back(instr->operands[i], pc_def);
               return {op.reg, true};
            }
         }
         return {PhysReg(), false};
      }
   }
   return {PhysReg(), false};
}
1087 
/* Finds a new register for every variable in \p vars (sorted from large to
 * small with increasing assigned register) and records a parallelcopy for
 * each move. Variables that have to be displaced in turn are relocated
 * recursively. The window \p def_reg is reserved for the definition and is
 * only reused for operands killed before the definition.
 * Returns false if a variable could not be placed.
 */
bool
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    const std::vector<unsigned>& vars, const PhysRegInterval bounds,
                    aco_ptr<Instruction>& instr, const PhysRegInterval def_reg)
{
   /* Variables are sorted from large to small and with increasing assigned register */
   for (unsigned id : vars) {
      assignment& var = ctx.assignments[id];
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
      uint32_t size = info.size;

      /* check if this is a dead operand, then we can re-use the space from the definition
       * also use the correct stride for sub-dword operands */
      bool is_dead_operand = false;
      std::pair<PhysReg, bool> res{PhysReg(), false};
      if (instr->opcode == aco_opcode::p_create_vector) {
         res =
            get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
      } else {
         for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
            if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
               info = DefInfo(ctx, instr, var.rc, i);
               if (instr->operands[i].isKillBeforeDef()) {
                  /* killed operands may live inside the definition window */
                  info.bounds = def_reg;
                  res = get_reg_simple(ctx, reg_file, info);
                  is_dead_operand = true;
               }
               break;
            }
         }
      }
      if (!res.second) {
         /* Try to find space within the bounds but outside of the definition */
         info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
         res = get_reg_simple(ctx, reg_file, info);
         if (!res.second && def_reg.hi() <= bounds.hi()) {
            /* round the start up to the variable's stride */
            unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
            info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res.second) {
         /* mark the area as blocked */
         reg_file.block(res.first, var.rc);

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(res.first, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      /* no free space found: displace other variables instead */
      PhysReg best_pos = bounds.lo();
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
           reg_win += stride) {
         if (!is_dead_operand && intersects(reg_win, def_reg))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (PhysReg j : reg_win) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(j) || k > num_moves) {
               found = false;
               break;
            }
            /* 0xF0000000 marks a register holding sub-dword allocations */
            if (reg_file[j] == 0xF0000000) {
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs inside control flow */
            if (!(ctx.block->kind & block_kind_top_level) &&
                ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            /* only smaller variables (or killed operands) may be displaced,
             * otherwise the recursion might not terminate */
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_win.lo();
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      PhysRegInterval reg_win{best_pos, size};

      /* collect variables and block reg file */
      std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win);

      /* mark the area as blocked */
      reg_file.block(reg_win.lo(), var.rc);
      adjust_max_used_regs(ctx, var.rc, reg_win.lo());

      /* recursively relocate the displaced variables */
      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, bounds, instr, def_reg))
         return false;

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}
1234 
/* Allocates a register window for the definition described by \p info,
 * displacing killed operands and smaller variables if necessary. A sliding
 * window picks the position requiring the fewest estimated moves; the
 * displaced variables are then relocated via get_regs_for_copies() and the
 * resulting copies are appended to \p parallelcopies.
 * Returns {register, true} on success, {undefined, false} on failure.
 */
std::pair<PhysReg, bool>
get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
             std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
             aco_ptr<Instruction>& instr)
{
   const PhysRegInterval& bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(bounds);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   std::bitset<256> is_killed_operand; /* per-register */
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      Operand& op = instr->operands[j];
      if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
          !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
         assert(op.isFixed());

         for (unsigned i = 0; i < op.size(); ++i) {
            is_killed_operand[(op.physReg() & 0xff) + i] = true;
         }

         killed_ops += op.getTemp().size();
      }
   }

   assert(regs_free >= size);
   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   PhysRegInterval best_win = {bounds.lo(), size};
   unsigned num_moves = 0xFF;
   unsigned num_vars = 0;

   /* we use a sliding window to check potential positions */
   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      /* first check if the register window starts in the middle of an
       * allocated variable: this is what we have to fix to allow for
       * num_moves > size */
      if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      /* prefer 4-aligned windows for v4 vectors */
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      for (const PhysReg j : reg_win) {
         /* dead operands effectively reduce the number of estimated moves */
         if (is_killed_operand[j & 0xFF]) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         /* 0xF0000000 marks a register holding sub-dword allocations */
         if (reg_file[j] == 0xF0000000) {
            k += 1;
            n++;
            continue;
         }

         /* don't displace variables at least as large as the definition */
         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs inside control flow */
         // TODO: ensure that live range splits inside control flow are never necessary
         if (!(ctx.block->kind & block_kind_top_level) &&
             ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      /* NOTE(review): 'found' is always true here — the !found case already
       * continued above, so this check is redundant. */
      if (found) {
         best_win = reg_win;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF)
      return {{}, false};

   /* now, we figured the placement for our definition */
   RegisterFile tmp_file(reg_file);

   /* p_create_vector: also re-place killed operands in the definition space */
   if (instr->opcode == aco_opcode::p_create_vector) {
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win);

   /* re-enable killed operands */
   if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) {
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, best_win))
      return {{}, false};

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   adjust_max_used_regs(ctx, rc, best_win.lo());
   return {best_win.lo(), true};
}
1382 
1383 bool
get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr, PhysReg reg)1384 get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr,
1385                   PhysReg reg)
1386 {
1387    /* catch out-of-range registers */
1388    if (reg >= PhysReg{512})
1389       return false;
1390 
1391    std::pair<unsigned, unsigned> sdw_def_info;
1392    if (rc.is_subdword())
1393       sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc);
1394 
1395    if (rc.is_subdword() && reg.byte() % sdw_def_info.first)
1396       return false;
1397    if (!rc.is_subdword() && reg.byte())
1398       return false;
1399 
1400    if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0)
1401       return false;
1402 
1403    PhysRegInterval reg_win = {reg, rc.size()};
1404    PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
1405    PhysRegInterval vcc_win = {vcc, 2};
1406    /* VCC is outside the bounds */
1407    bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc;
1408    bool is_m0 = rc == s1 && reg == m0;
1409    if (!bounds.contains(reg_win) && !is_vcc && !is_m0)
1410       return false;
1411 
1412    if (rc.is_subdword()) {
1413       PhysReg test_reg;
1414       test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1);
1415       if (reg_file.test(test_reg, sdw_def_info.second))
1416          return false;
1417    } else {
1418       if (reg_file.test(reg, rc.bytes()))
1419          return false;
1420    }
1421 
1422    adjust_max_used_regs(ctx, rc, reg_win.lo());
1423    return true;
1424 }
1425 
1426 bool
increase_register_file(ra_ctx& ctx, RegType type)1427 increase_register_file(ra_ctx& ctx, RegType type)
1428 {
1429    if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
1430       update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
1431                                                           ctx.program->max_reg_demand.sgpr));
1432    } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
1433       update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
1434                                                           ctx.program->max_reg_demand.sgpr + 1));
1435    } else {
1436       return false;
1437    }
1438    return true;
1439 }
1440 
/* Pairs a temporary id with its register class (used by
 * compact_relocate_vars()). */
struct IDAndRegClass {
   IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}

   unsigned id;
   RegClass rc;
};
1447 
/* Pairs a temporary id with its placement info (size/stride/bounds). */
struct IDAndInfo {
   IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}

   unsigned id;
   DefInfo info;
};
1454 
/* Reallocates vars by sorting them and placing each variable after the previous
 * one. If one of the variables has 0xffffffff as an ID, the register assigned
 * for that variable will be returned.
 */
PhysReg
compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
{
   /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
    * temporary sizes to dwords.
    */
   std::vector<IDAndInfo> sorted;
   for (IDAndRegClass var : vars) {
      DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
      sorted.emplace_back(var.id, info);
   }

   /* sort by decreasing stride (in bytes), then by increasing current register */
   std::sort(
      sorted.begin(), sorted.end(),
      [&ctx](const IDAndInfo& a, const IDAndInfo& b)
      {
         unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
         unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
         if (a_stride > b_stride)
            return true;
         if (a_stride < b_stride)
            return false;
         if (a.id == 0xffffffff || b.id == 0xffffffff)
            return a.id ==
                   0xffffffff; /* place 0xffffffff before others if possible, not for any reason */
         return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
      });

   PhysReg next_reg = start;
   PhysReg space_reg;
   for (IDAndInfo& var : sorted) {
      /* align each variable's start to its stride (at least one dword) */
      unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
      next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));

      /* 0xffffffff is a special variable ID used reserve a space for killed
       * operands and definitions.
       */
      if (var.id != 0xffffffff) {
         /* emit a copy only when the variable actually changes position */
         if (next_reg != ctx.assignments[var.id].reg) {
            RegClass rc = ctx.assignments[var.id].rc;
            Temp tmp(var.id, rc);

            Operand pc_op(tmp);
            pc_op.setFixed(ctx.assignments[var.id].reg);
            Definition pc_def(next_reg, rc);
            parallelcopies.emplace_back(pc_op, pc_def);
         }
      } else {
         space_reg = next_reg;
      }

      adjust_max_used_regs(ctx, var.info.rc, next_reg);

      next_reg = next_reg.advance(var.info.rc.size() * 4);
   }

   return space_reg;
}
1518 
/* Checks whether the MIMG vaddr components (operands [3..]) either already
 * form a contiguous vector in registers or can still be completed into one.
 * Returns false as soon as an assigned operand breaks the expected layout
 * or the needed registers are unavailable.
 */
bool
is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
{
   /* 512 serves as a sentinel meaning "base register not determined yet" */
   PhysReg first{512};
   for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
      Operand op = instr->operands[i + 3];

      if (ctx.assignments[op.tempId()].assigned) {
         PhysReg reg = ctx.assignments[op.tempId()].reg;

         if (first.reg() == 512) {
            /* derive the vector's base register from the first assigned operand */
            PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr);
            first = reg.advance(i * -4);
            PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
            if (!bounds.contains(vec)) /* not enough space for other operands */
               return false;
         } else {
            if (reg != first.advance(i * 4)) /* not at the best position */
               return false;
         }
      } else {
         /* If there's an unexpected temporary, this operand is unlikely to be
          * placed in the best position.
          */
         if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4))
            return false;
      }
   }

   return true;
}
1550 
std::pair<PhysReg, bool>
get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
{
   /* Tries to find a register for \p temp that keeps it coalesced with the
    * other operands of the vector instruction it feeds (ctx.vectors).
    * Returns {reg, true} on success, {{}, false} if no coalescing position
    * could be found (the caller then falls back to normal allocation).
    */
   Instruction* vec = ctx.vectors[temp.id()];
   /* For MIMG, only operands 3.. form the VADDR vector. */
   unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
   /* Compute the byte offset of `temp` within the vector. */
   unsigned our_offset = 0;
   for (unsigned i = first_operand; i < vec->operands.size(); i++) {
      Operand& op = vec->operands[i];
      if (op.isTemp() && op.tempId() == temp.id())
         break;
      else
         our_offset += op.bytes();
   }

   if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) {
      unsigned their_offset = 0;
      /* check for every operand of the vector
       * - whether the operand is assigned and
       * - we can use the register relative to that operand
       */
      for (unsigned i = first_operand; i < vec->operands.size(); i++) {
         Operand& op = vec->operands[i];
         if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
             ctx.assignments[op.tempId()].assigned) {
            /* Derive our register from the sibling's: shift by the byte
             * distance between the two operands within the vector. */
            PhysReg reg = ctx.assignments[op.tempId()].reg;
            reg.reg_b += (our_offset - their_offset);
            if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
               return {reg, true};

            /* return if MIMG vaddr components don't remain vector-aligned */
            if (vec->format == Format::MIMG)
               return {{}, false};
         }
         their_offset += op.bytes();
      }

      /* We didn't find a register relative to other vector operands.
       * Try to find new space which fits the whole vector.
       */
      RegClass vec_rc = RegClass::get(temp.type(), their_offset);
      DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
      PhysReg reg = res.first;
      if (res.second) {
         /* Place `temp` at its offset within the freshly found space. */
         reg.reg_b += our_offset;
         /* make sure to only use byte offset if the instruction supports it */
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
            return {reg, true};
      }
   }
   return {{}, false};
}
1603 
/* Finds a register for \p temp, trying strategies in order of preference:
 * 1. split-vector / affinity / vcc hints recorded in ctx.assignments,
 * 2. a position that coalesces with other operands of the same vector,
 * 3. any free spot without live-range splits (optimistic path),
 * 4. a spot freed by moving other variables (live-range splits),
 * 5. after raising the register limit, retry recursively,
 * 6. as a last resort, compact all variables of this register type to the
 *    bottom of the register file and allocate in the freed space.
 * Parallelcopies required to make room are appended to \p parallelcopies.
 */
PhysReg
get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
        std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
        int operand_index = -1)
{
   /* If this temp is an operand of a p_split_vector, try to place it so that
    * the split results (via their affinities) land without copies. */
   auto split_vec = ctx.split_vectors.find(temp.id());
   if (split_vec != ctx.split_vectors.end()) {
      unsigned offset = 0;
      for (Definition def : split_vec->second->definitions) {
         if (ctx.assignments[def.tempId()].affinity) {
            assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
            if (affinity.assigned) {
               /* Back out the vector start from the component's position. */
               PhysReg reg = affinity.reg;
               reg.reg_b -= offset;
               if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
                  return reg;
            }
         }
         offset += def.bytes();
      }
   }

   /* Direct affinity hint (e.g. to coalesce phis or parallelcopies). */
   if (ctx.assignments[temp.id()].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
      if (affinity.assigned) {
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg))
            return affinity.reg;
      }
   }
   /* Prefer vcc if the temp was marked as benefiting from it. */
   if (ctx.assignments[temp.id()].vcc) {
      if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc))
         return vcc;
   }

   std::pair<PhysReg, bool> res;

   /* Try to coalesce with the vector this temp will be combined into. */
   if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
      res = get_reg_vector(ctx, reg_file, temp, instr);
      if (res.second)
         return res.first;
   }

   DefInfo info(ctx, instr, temp.regClass(), operand_index);

   if (!ctx.policy.skip_optimistic_path) {
      /* try to find space without live-range splits */
      res = get_reg_simple(ctx, reg_file, info);

      if (res.second)
         return res.first;
   }

   /* try to find space with live-range splits */
   res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);

   if (res.second)
      return res.first;

   /* try using more registers */

   /* We should only fail here because keeping under the limit would require
    * too many moves. */
   assert(reg_file.count_zero(info.bounds) >= info.size);

   if (!increase_register_file(ctx, info.rc.type())) {
      /* fallback algorithm: reallocate all variables at once */
      unsigned def_size = info.rc.size();
      for (Definition def : instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_size += def.regClass().size();
      }

      unsigned killed_op_size = 0;
      for (Operand op : instr->operands) {
         if (op.isTemp() && op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_size += op.regClass().size();
      }

      const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type());

      /* reallocate passthrough variables and non-killed operands */
      std::vector<IDAndRegClass> vars;
      for (unsigned id : find_vars(ctx, reg_file, regs))
         vars.emplace_back(id, ctx.assignments[id].rc);
      /* 0xffffffff reserves a hole big enough for both the definitions and
       * the killed operands (which can overlap, hence MAX2). */
      vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));

      PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());

      /* reallocate killed operands */
      std::vector<IDAndRegClass> killed_op_vars;
      for (Operand op : instr->operands) {
         if (op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_vars.emplace_back(op.tempId(), op.regClass());
      }
      compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);

      /* reallocate definitions */
      std::vector<IDAndRegClass> def_vars;
      for (Definition def : instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_vars.emplace_back(def.tempId(), def.regClass());
      }
      /* 0xffffffff marks the slot for `temp` itself; its register is the
       * return value. */
      def_vars.emplace_back(0xffffffff, info.rc);
      return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
   }

   /* Register file was enlarged: retry the whole allocation. */
   return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
}
1712 
/* Finds a register for the definition of a p_create_vector so that as many
 * killed operands as possible are already in place (their final position
 * inside the vector), minimizing shuffle copies. Falls back to get_reg()
 * when any candidate position would cost too many moves.
 */
PhysReg
get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
                      aco_ptr<Instruction>& instr)
{
   RegClass rc = temp.regClass();
   /* create_vector instructions have different costs w.r.t. register coalescing */
   uint32_t size = rc.size();
   uint32_t bytes = rc.bytes();
   uint32_t stride = get_stride(rc);
   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());

   // TODO: improve p_create_vector for sub-dword vectors

   PhysReg best_pos{0xFFF};
   unsigned num_moves = 0xFF; /* cost (in bytes) of the best candidate so far */
   bool best_avoid = true;
   uint32_t correct_pos_mask = 0; /* bit i set: operand i already in place */

   /* test for each operand which definition placement causes the least shuffle instructions */
   for (unsigned i = 0, offset = 0; i < instr->operands.size();
        offset += instr->operands[i].bytes(), i++) {
      // TODO: think about, if we can alias live operands on the same register
      /* Only killed temp operands of the right type can be left in place. */
      if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
          instr->operands[i].getTemp().type() != rc.type())
         continue;

      /* Candidate vector start = operand position minus its offset in the
       * vector; skip if that would start below register 0. */
      if (offset > instr->operands[i].physReg().reg_b)
         continue;

      unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
      if (reg_lower % 4) /* vector must be dword-aligned */
         continue;
      PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
      unsigned k = 0; /* cost accumulator for this candidate */

      /* no need to check multiple times */
      if (reg_win.lo() == best_pos)
         continue;

      /* check borders */
      // TODO: this can be improved */
      if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
         continue;
      /* Don't split a variable straddling the lower/upper window border. */
      if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* count variables to be moved and check "avoid" */
      bool avoid = false;
      bool linear_vgpr = false;
      for (PhysReg j : reg_win) {
         if (reg_file[j] != 0) {
            if (reg_file[j] == 0xF0000000) {
               /* NOTE(review): 0xF0000000 appears to mark a dword partially
                * occupied by subdword temps — count only the used bytes. */
               PhysReg reg;
               reg.reg_b = j * 4;
               unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
               for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
                  k += reg_file.test(reg, 1);
            } else {
               k += 4;
               linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
            }
         }
         avoid |= ctx.war_hint[j];
      }

      if (linear_vgpr) {
         /* we cannot split live ranges of linear vgprs inside control flow */
         if (ctx.block->kind & block_kind_top_level)
            avoid = true;
         else
            continue;
      }

      if (avoid && !best_avoid)
         continue;

      /* count operands in wrong positions */
      uint32_t correct_pos_mask_new = 0;
      for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
           offset2 += instr->operands[j].bytes(), j++) {
         Operand& op = instr->operands[j];
         if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2)
            correct_pos_mask_new |= 1 << j;
         else
            k += op.bytes(); /* each misplaced operand costs its size in copies */
      }
      /* Tie-break in favor of 4-dword-aligned v4 vectors. */
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      if (k > num_moves || (!aligned && k == num_moves))
         continue;

      best_pos = reg_win.lo();
      num_moves = k;
      best_avoid = avoid;
      correct_pos_mask = correct_pos_mask_new;
   }

   /* too many moves: try the generic get_reg() function */
   if (num_moves >= 2 * bytes) {
      return get_reg(ctx, reg_file, temp, parallelcopies, instr);
   } else if (num_moves > bytes) {
      /* Moderately expensive: only take a free spot, otherwise keep best_pos. */
      DefInfo info(ctx, instr, rc, -1);
      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
      if (res.second)
         return res.first;
   }

   /* re-enable killed operands which are in the wrong position */
   RegisterFile tmp_file(reg_file);
   for (Operand& op : instr->operands) {
      if (op.isTemp() && op.isFirstKillBeforeDef())
         tmp_file.fill(op);
   }
   /* ...but keep correctly-placed killed operands cleared so they can be
    * overlapped by the definition. */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill())
         tmp_file.clear(instr->operands[i]);
   }

   /* collect variables to be moved */
   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});

   bool success = false;
   std::vector<std::pair<Operand, Definition>> pc;
   success =
      get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size});

   if (!success) {
      if (!increase_register_file(ctx, temp.type())) {
         /* use the fallback algorithm in get_reg() */
         return get_reg(ctx, reg_file, temp, parallelcopies, instr);
      }
      /* Register file was enlarged: retry. */
      return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
   }

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
   adjust_max_used_regs(ctx, rc, best_pos);

   return best_pos;
}
1856 
/* For pseudo instructions that are lowered to copies after RA, records
 * whether SCC is live across the instruction and picks a free scratch SGPR
 * the lowering can use (to save/restore SCC, or for subdword copies on
 * GFX7 and earlier).
 */
void
handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
{
   if (instr->format != Format::PSEUDO)
      return;

   /* all instructions which use handle_operands() need this information */
   switch (instr->opcode) {
   case aco_opcode::p_extract_vector:
   case aco_opcode::p_create_vector:
   case aco_opcode::p_split_vector:
   case aco_opcode::p_parallelcopy:
   case aco_opcode::p_wqm: break;
   default: return;
   }

   bool writes_linear = false;
   /* if all definitions are logical vgpr, no need to care for SCC */
   for (Definition& def : instr->definitions) {
      if (def.getTemp().regClass().is_linear())
         writes_linear = true;
   }
   /* if all operands are constant, no need to care either */
   bool reads_linear = false;
   bool reads_subdword = false;
   for (Operand& op : instr->operands) {
      if (op.isTemp() && op.getTemp().regClass().is_linear())
         reads_linear = true;
      if (op.isTemp() && op.regClass().is_subdword())
         reads_subdword = true;
   }
   /* A scratch SGPR is needed if the lowering may clobber SCC while SCC is
    * live, or for subdword copies on GFX7 and earlier. */
   bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) ||
                            (ctx.program->gfx_level <= GFX7 && reads_subdword);
   if (!needs_scratch_reg)
      return;

   instr->pseudo().tmp_in_scc = reg_file[scc];

   /* Search downward from the highest SGPR used so far... */
   int reg = ctx.max_used_sgpr;
   for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
      ;
   if (reg < 0) {
      /* ...and if none is free there, upward toward the demand limit. */
      reg = ctx.max_used_sgpr + 1;
      for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
         ;
      if (reg == ctx.program->max_reg_demand.sgpr) {
         /* No free SGPR at all: fall back to m0 (only valid for the
          * subdword case, asserted here). */
         assert(reads_subdword && reg_file[m0] == 0);
         reg = m0;
      }
   }

   adjust_max_used_regs(ctx, s1, reg);
   instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
}
1911 
/* Returns whether operand \p idx of \p instr may legally be assigned
 * register \p reg, considering hardware restrictions (writelane's m0
 * requirement on GFX9 and earlier, SMEM register constraints, subdword
 * alignment strides). May fix the operand to m0 as a side effect for
 * writelane.
 */
bool
operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
                    RegClass rc)
{
   /* Already fixed: only the fixed register is acceptable. */
   if (instr->operands[idx].isFixed())
      return instr->operands[idx].physReg() == reg;

   bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
                       instr->opcode == aco_opcode::v_writelane_b32_e64;
   if (gfx_level <= GFX9 && is_writelane && idx <= 1) {
      /* v_writelane_b32 can take two sgprs but only if one is m0. */
      bool is_other_sgpr =
         instr->operands[!idx].isTemp() &&
         (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0);
      if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) {
         /* Force this operand to m0 so the other may take a regular SGPR. */
         instr->operands[idx].setFixed(m0);
         return reg == m0;
      }
   }

   /* Subdword operands must respect the instruction's byte stride. */
   if (reg.byte()) {
      unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
      if (reg.byte() % stride)
         return false;
   }

   switch (instr->format) {
   case Format::SMEM:
      return reg != scc && reg != exec &&
             (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
             (reg != vcc || (instr->definitions.empty() && idx == 2) ||
              gfx_level >= GFX10); /* sdata can be vcc */
   default:
      // TODO: there are more instructions with restrictions on registers
      return true;
   }
}
1949 
/* Moves operand \p operand into a usable register: either its pre-fixed
 * register (evicting whatever blocks it) or a freshly allocated one. The
 * required copy (src -> dst) is appended to \p parallelcopy and renames are
 * applied.
 */
void
get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopy,
                    aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
{
   /* check if the operand is fixed */
   PhysReg src = ctx.assignments[operand.tempId()].reg;
   PhysReg dst;
   if (operand.isFixed()) {
      /* Caller only invokes this when the operand is not already in place. */
      assert(operand.physReg() != src);

      /* check if target reg is blocked, and move away the blocking var */
      if (register_file.test(operand.physReg(), operand.bytes())) {
         PhysRegInterval target{operand.physReg(), operand.size()};

         RegisterFile tmp_file(register_file);

         std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, target);

         tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src
         /* Block the target so the evicted variables don't land there. */
         tmp_file.block(operand.physReg(), operand.regClass());

         DefInfo info(ctx, instr, operand.regClass(), -1);
         get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr,
                             PhysRegInterval());
      }
      dst = operand.physReg();

   } else {
      /* clear the operand in case it's only a stride mismatch */
      register_file.clear(src, operand.regClass());
      dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
   }

   /* Record the src -> dst copy and rename downstream uses. */
   Operand pc_op = operand;
   pc_op.setFixed(src);
   Definition pc_def = Definition(dst, pc_op.regClass());
   parallelcopy.emplace_back(pc_op, pc_def);
   update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
}
1990 
/* Allocates a register for a phi definition \p tmp. Any variables displaced
 * by the allocation are handled by either updating an earlier phi in
 * \p instructions or by materializing a new phi that performs the move in
 * the predecessors. Returns the chosen register.
 */
PhysReg
get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
            std::vector<aco_ptr<Instruction>>& instructions, Block& block,
            aco_ptr<Instruction>& phi, Temp tmp)
{
   std::vector<std::pair<Operand, Definition>> parallelcopy;
   PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
   update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);

   /* process parallelcopy */
   for (std::pair<Operand, Definition> pc : parallelcopy) {
      /* see if it's a copy from a different phi */
      // TODO: prefer moving some previous phis over live-ins
      // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
      // problem in practice since they can only be fixed to exec)
      Instruction* prev_phi = NULL;
      std::vector<aco_ptr<Instruction>>::iterator phi_it;
      for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
         if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
            prev_phi = phi_it->get();
      }
      if (prev_phi) {
         /* if so, just update that phi's register */
         prev_phi->definitions[0].setFixed(pc.second.physReg());
         register_file.fill(prev_phi->definitions[0]);
         ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
                                                               pc.second.regClass()};
         continue;
      }

      /* rename: map the original SSA name to the copy's new temp so later
       * uses in this block pick up the moved value */
      std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
      Temp orig = pc.first.getTemp();
      if (orig_it != ctx.orig_names.end())
         orig = orig_it->second;
      else
         ctx.orig_names[pc.second.tempId()] = orig;
      ctx.renames[block.index][orig.id()] = pc.second.getTemp();

      /* otherwise, this is a live-in and we need to create a new phi
       * to move it in this block's predecessors */
      aco_opcode opcode =
         pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
      std::vector<unsigned>& preds =
         pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
      aco_ptr<Instruction> new_phi{
         create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
      new_phi->definitions[0] = pc.second;
      for (unsigned i = 0; i < preds.size(); i++)
         new_phi->operands[i] = Operand(pc.first);
      instructions.emplace_back(std::move(new_phi));

      /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is
       * a loop header.
       */
      live_in.erase(orig.id());
   }

   return reg;
}
2051 
/* Assigns registers to the phis of \p block in three passes of decreasing
 * preference: (1) phis whose operands all agree on one register, (2) phis
 * with an affinity or at least one operand's register available, (3) the
 * rest via get_reg_phi() (which may create copies/new phis). Non-killed
 * phis are moved from the block into \p instructions.
 */
void
get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
                  std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
{
   /* move all phis to instructions */
   for (aco_ptr<Instruction>& phi : block.instructions) {
      if (!is_phi(phi))
         break;
      if (!phi->definitions[0].isKill())
         instructions.emplace_back(std::move(phi));
   }

   /* assign phis with all-matching registers to that register */
   for (aco_ptr<Instruction>& phi : instructions) {
      Definition& definition = phi->definitions[0];
      if (definition.isFixed())
         continue;

      if (!phi->operands[0].isTemp())
         continue;

      PhysReg reg = phi->operands[0].physReg();
      /* An operand "matches" if it is a temp and either unfixed or fixed to
       * the same register as operand 0. */
      auto OpsSame = [=](const Operand& op) -> bool
      { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
      bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
      if (!all_same)
         continue;

      if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg))
         continue;

      definition.setFixed(reg);
      register_file.fill(definition);
      ctx.assignments[definition.tempId()].set(definition);
   }

   /* try to find a register that is used by at least one operand */
   for (aco_ptr<Instruction>& phi : instructions) {
      Definition& definition = phi->definitions[0];
      if (definition.isFixed())
         continue;

      /* use affinity if available */
      if (ctx.assignments[definition.tempId()].affinity &&
          ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
         assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
         assert(affinity.rc == definition.regClass());
         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) {
            definition.setFixed(affinity.reg);
            register_file.fill(definition);
            ctx.assignments[definition.tempId()].set(definition);
            continue;
         }
      }

      /* by going backwards, we aim to avoid copies in else-blocks */
      for (int i = phi->operands.size() - 1; i >= 0; i--) {
         const Operand& op = phi->operands[i];
         if (!op.isTemp() || !op.isFixed())
            continue;

         PhysReg reg = op.physReg();
         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
            definition.setFixed(reg);
            register_file.fill(definition);
            ctx.assignments[definition.tempId()].set(definition);
            break;
         }
      }
   }

   /* find registers for phis where the register was blocked or no operand was assigned */

   /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */
   for (unsigned i = 0; i < instructions.size(); i++) {
      aco_ptr<Instruction>& phi = instructions[i];
      Definition& definition = phi->definitions[0];
      if (definition.isFixed())
         continue;

      definition.setFixed(
         get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp()));

      register_file.fill(definition);
      ctx.assignments[definition.tempId()].set(definition);
   }
}
2139 
2140 Temp
read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)2141 read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
2142 {
2143    std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
2144    if (it == ctx.renames[block_idx].end())
2145       return val;
2146    else
2147       return it->second;
2148 }
2149 
/* Resolves the name of live-in \p val at the start of \p block. If all
 * predecessors agree on one rename, that name is returned; otherwise a new
 * phi is inserted at the top of the block (SSA repair) and its definition
 * is returned.
 */
Temp
handle_live_in(ra_ctx& ctx, Temp val, Block* block)
{
   std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
   /* No predecessors (entry block): the value keeps its name. */
   if (preds.size() == 0)
      return val;

   if (preds.size() == 1) {
      /* if the block has only one predecessor, just look there for the name */
      return read_variable(ctx, val, preds[0]);
   }

   /* there are multiple predecessors and the block is sealed */
   Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));

   /* get the rename from each predecessor and check if they are the same */
   Temp new_val;
   bool needs_phi = false;
   for (unsigned i = 0; i < preds.size(); i++) {
      ops[i] = read_variable(ctx, val, preds[i]);
      if (i == 0)
         new_val = ops[i];
      else
         needs_phi |= !(new_val == ops[i]);
   }

   if (needs_phi) {
      assert(!val.regClass().is_linear_vgpr());

      /* the variable has been renamed differently in the predecessors: we need to insert a phi */
      aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
      aco_ptr<Instruction> phi{
         create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
      new_val = ctx.program->allocateTmp(val.regClass());
      phi->definitions[0] = Definition(new_val);
      /* Keep the assignments table in sync with the new temp id. */
      ctx.assignments.emplace_back();
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
      for (unsigned i = 0; i < preds.size(); i++) {
         /* update the operands so that it uses the new affinity */
         phi->operands[i] = Operand(ops[i]);
         assert(ctx.assignments[ops[i].id()].assigned);
         assert(ops[i].regClass() == new_val.regClass());
         phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
      }
      block->instructions.insert(block->instructions.begin(), std::move(phi));
   }

   return new_val;
}
2199 
/* Called when leaving a loop: creates header phis for live-in variables
 * that were renamed inside the loop body, fixes up loop-carried phi
 * operands, and propagates the new names through all loop blocks
 * [loop_header_idx, loop_exit_idx).
 */
void
handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
                 uint32_t loop_exit_idx)
{
   Block& loop_header = ctx.program->blocks[loop_header_idx];
   std::unordered_map<unsigned, Temp> renames;

   /* create phis for variables renamed during the loop */
   for (unsigned t : live_in) {
      Temp val = Temp(t, ctx.program->temp_rc[t]);
      /* Name on entry (preheader is the block before the header). */
      Temp prev = read_variable(ctx, val, loop_header_idx - 1);
      Temp renamed = handle_live_in(ctx, val, &loop_header);
      if (renamed == prev)
         continue; /* not renamed inside the loop: nothing to do */

      /* insert additional renames at block end, but don't overwrite */
      renames[prev.id()] = renamed;
      ctx.orig_names[renamed.id()] = val;
      for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
         auto it = ctx.renames[idx].emplace(val.id(), renamed);
         /* if insertion is unsuccessful, update if necessary */
         if (!it.second && it.first->second == prev)
            it.first->second = renamed;
      }

      /* update loop-carried values of the phi created by handle_live_in() */
      for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
         Operand& op = loop_header.instructions[0]->operands[i];
         if (op.getTemp() == prev)
            op.setTemp(renamed);
      }

      /* use the assignment from the loop preheader and fix def reg */
      assignment& var = ctx.assignments[prev.id()];
      ctx.assignments[renamed.id()] = var;
      loop_header.instructions[0]->definitions[0].setFixed(var.reg);
   }

   /* rename loop carried phi operands */
   for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
      aco_ptr<Instruction>& phi = loop_header.instructions[i];
      if (!is_phi(phi))
         break;
      const std::vector<unsigned>& preds =
         phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
      /* Operand 0 comes from the preheader; only back-edge operands (1..)
       * need renaming. */
      for (unsigned j = 1; j < phi->operands.size(); j++) {
         Operand& op = phi->operands[j];
         if (!op.isTemp())
            continue;

         /* Find the original name, since this operand might not use the original name if the phi
          * was created after init_reg_file().
          */
         std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(op.tempId());
         Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();

         op.setTemp(read_variable(ctx, orig, preds[j]));
         op.setFixed(ctx.assignments[op.tempId()].reg);
      }
   }

   /* return early if no new phi was created */
   if (renames.empty())
      return;

   /* propagate new renames through loop */
   for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
      Block& current = ctx.program->blocks[idx];
      /* rename all uses in this block */
      for (aco_ptr<Instruction>& instr : current.instructions) {
         /* phis are renamed after RA */
         if (idx == loop_header_idx && is_phi(instr))
            continue;

         for (Operand& op : instr->operands) {
            if (!op.isTemp())
               continue;

            auto rename = renames.find(op.tempId());
            if (rename != renames.end()) {
               assert(rename->second.id());
               op.setTemp(rename->second);
            }
         }
      }
   }
}
2287 
2288 /**
2289  * This function serves the purpose to correctly initialize the register file
2290  * at the beginning of a block (before any existing phis).
2291  * In order to do so, all live-in variables are entered into the RegisterFile.
2292  * Reg-to-reg moves (renames) from previous blocks are taken into account and
2293  * the SSA is repaired by inserting corresponding phi-nodes.
2294  */
RegisterFile
init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
{
   /* On leaving a loop, repair the SSA for variables that got renamed inside the
    * loop body: pop the innermost loop header and let handle_loop_phis() insert
    * the necessary phis / propagate the renames up to this exit block.
    */
   if (block.kind & block_kind_loop_exit) {
      uint32_t header = ctx.loop_header.back();
      ctx.loop_header.pop_back();
      handle_loop_phis(ctx, live_out_per_block[header], header, block.index);
   }

   RegisterFile register_file;
   /* Variables live at the start of this block (indexed by this block). */
   const IDSet& live_in = live_out_per_block[block.index];
   assert(block.index != 0 || live_in.empty());

   if (block.kind & block_kind_loop_header) {
      /* Remember this header; the matching loop-exit block pops it above. */
      ctx.loop_header.emplace_back(block.index);
      /* already rename phis incoming value */
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (!is_phi(instr))
            break;
         /* Only operand 0 (the preheader edge) can be renamed now; the
          * loop-carried operands are fixed up when the loop exit is reached. */
         Operand& operand = instr->operands[0];
         if (operand.isTemp()) {
            operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
            operand.setFixed(ctx.assignments[operand.tempId()].reg);
         }
      }
      /* Live-in variables keep the register assignment they had at the end of
       * the preheader (block.index - 1); enter them into the register file. */
      for (unsigned t : live_in) {
         Temp val = Temp(t, ctx.program->temp_rc[t]);
         Temp renamed = read_variable(ctx, val, block.index - 1);
         if (renamed != val)
            ctx.renames[block.index][val.id()] = renamed;
         assignment& var = ctx.assignments[renamed.id()];
         assert(var.assigned);
         register_file.fill(Definition(renamed.id(), var.reg, var.rc));
      }
   } else {
      /* rename phi operands */
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (!is_phi(instr))
            break;
         /* Logical phis take operands from logical preds, linear phis from
          * linear preds; operand i corresponds to preds[i]. */
         const std::vector<unsigned>& preds =
            instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;

         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand& operand = instr->operands[i];
            if (!operand.isTemp())
               continue;
            operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
            operand.setFixed(ctx.assignments[operand.tempId()].reg);
         }
      }
      /* Enter live-in variables; handle_live_in() may create a new phi if the
       * predecessors disagree on the name (SSA repair after live-range splits). */
      for (unsigned t : live_in) {
         Temp val = Temp(t, ctx.program->temp_rc[t]);
         Temp renamed = handle_live_in(ctx, val, &block);
         assignment& var = ctx.assignments[renamed.id()];
         /* due to live-range splits, the live-in might be a phi, now */
         if (var.assigned) {
            register_file.fill(Definition(renamed.id(), var.reg, var.rc));
         }
         /* Record the rename and remember the original name so later phi
          * operands can be resolved back to it. */
         if (renamed != val) {
            ctx.renames[block.index].emplace(t, renamed);
            ctx.orig_names[renamed.id()] = val;
         }
      }
   }

   return register_file;
}
2362 
void
get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
{
   /* Walk all blocks backwards and collect, per phi, the set of temporaries
    * that would ideally share one register: the phi definition, its killed
    * same-class operands, and values chained through parallelcopies or
    * mac-style instructions.  Each collected set becomes a chain of
    * 'affinity' links in ctx.assignments which the allocation pass tries to
    * honor.  Vector/VCC placement hints are gathered along the way.
    */

   /* Each entry is one merge set; element [0] is the "leader" whose eventual
    * register the others should get. */
   std::vector<std::vector<Temp>> phi_ressources;
   /* Maps a temp id to the index of the merge set it currently belongs to. */
   std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;

   for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
        block_rit++) {
      Block& block = *block_rit;

      /* first, compute the death points of all live vars within the block */
      IDSet& live = live_out_per_block[block.index];

      std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
      for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
         aco_ptr<Instruction>& instr = *rit;
         if (is_phi(instr))
            break;

         /* add vector affinities */
         if (instr->opcode == aco_opcode::p_create_vector) {
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isFirstKill() &&
                   op.getTemp().type() == instr->definitions[0].getTemp().type())
                  ctx.vectors[op.tempId()] = instr.get();
            }
         } else if (instr->format == Format::MIMG && instr->operands.size() > 4) {
            /* Operands from index 3 onwards (presumably the image address
             * components — confirm against MIMG operand layout) are treated
             * like a vector so they end up contiguous. */
            for (unsigned i = 3; i < instr->operands.size(); i++)
               ctx.vectors[instr->operands[i].tempId()] = instr.get();
         } else if (instr->opcode == aco_opcode::p_split_vector &&
                    instr->operands[0].isFirstKillBeforeDef()) {
            ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
         } else if (instr->isVOPC() && !instr->isVOP3()) {
            /* Plain VOPC writes VCC implicitly: hint the definition to VCC.
             * (GFX8 SDWA VOPC also writes VCC.) */
            if (!instr->isSDWA() || ctx.program->gfx_level == GFX8)
               ctx.assignments[instr->definitions[0].tempId()].vcc = true;
         } else if (instr->isVOP2() && !instr->isVOP3()) {
            /* VOP2 carry-in (operand 2) and carry-out (definition 1) are
             * implicitly VCC in the compact encoding. */
            if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
                instr->operands[2].regClass().type() == RegType::sgpr)
               ctx.assignments[instr->operands[2].tempId()].vcc = true;
            if (instr->definitions.size() == 2)
               ctx.assignments[instr->definitions[1].tempId()].vcc = true;
         } else if (instr->opcode == aco_opcode::s_and_b32 ||
                    instr->opcode == aco_opcode::s_and_b64) {
            /* If SCC is used by a branch, we might be able to use
             * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC.
             */
            if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() &&
                instr->operands[1].isFixed() && instr->operands[1].physReg() == exec)
               ctx.assignments[instr->operands[0].tempId()].vcc = true;
         }

         /* add operands to live variables */
         for (const Operand& op : instr->operands) {
            if (op.isTemp())
               live.insert(op.tempId());
         }

         /* erase definitions from live */
         for (unsigned i = 0; i < instr->definitions.size(); i++) {
            const Definition& def = instr->definitions[i];
            if (!def.isTemp())
               continue;
            live.erase(def.tempId());
            /* mark last-seen phi operand */
            std::unordered_map<unsigned, unsigned>::iterator it =
               temp_to_phi_ressources.find(def.tempId());
            if (it != temp_to_phi_ressources.end() &&
                def.regClass() == phi_ressources[it->second][0].regClass()) {
               /* This definition feeds a phi merge set: it becomes the new
                * leader of that set. */
               phi_ressources[it->second][0] = def.getTemp();
               /* try to coalesce phi affinities with parallelcopies */
               Operand op = Operand();
               switch (instr->opcode) {
               case aco_opcode::p_parallelcopy: op = instr->operands[i]; break;

               /* These opcodes have (or can be converted to) a form where the
                * destination reuses the register of a specific source operand,
                * so chaining that operand into the merge set is profitable. */
               case aco_opcode::v_interp_p2_f32:
               case aco_opcode::v_writelane_b32:
               case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break;

               case aco_opcode::v_fma_f32:
               case aco_opcode::v_fma_f16:
               case aco_opcode::v_pk_fma_f16:
                  /* fmac variants only exist on GFX10+ */
                  if (ctx.program->gfx_level < GFX10)
                     continue;
                  FALLTHROUGH;
               case aco_opcode::v_mad_f32:
               case aco_opcode::v_mad_f16:
                  /* the VOP2 mac form cannot encode modifiers */
                  if (instr->usesModifiers())
                     continue;
                  op = instr->operands[2];
                  break;

               case aco_opcode::v_mad_legacy_f32:
               case aco_opcode::v_fma_legacy_f32:
                  if (instr->usesModifiers() || !ctx.program->dev.has_mac_legacy32)
                     continue;
                  op = instr->operands[2];
                  break;

               default: continue;
               }

               if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
                  phi_ressources[it->second].emplace_back(op.getTemp());
                  temp_to_phi_ressources[op.tempId()] = it->second;
               }
            }
         }
      }

      /* collect phi affinities */
      for (; rit != block.instructions.rend(); ++rit) {
         aco_ptr<Instruction>& instr = *rit;
         assert(is_phi(instr));

         live.erase(instr->definitions[0].tempId());
         if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
            continue;

         assert(instr->definitions[0].isTemp());
         /* Join the phi's definition into an existing merge set, or start a
          * new one with the definition as leader. */
         std::unordered_map<unsigned, unsigned>::iterator it =
            temp_to_phi_ressources.find(instr->definitions[0].tempId());
         unsigned index = phi_ressources.size();
         std::vector<Temp>* affinity_related;
         if (it != temp_to_phi_ressources.end()) {
            index = it->second;
            phi_ressources[index][0] = instr->definitions[0].getTemp();
            affinity_related = &phi_ressources[index];
         } else {
            phi_ressources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
            affinity_related = &phi_ressources.back();
         }

         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
               affinity_related->emplace_back(op.getTemp());
               /* NOTE(review): for loop headers the operand is not registered
                * in the lookup map here — presumably because loop-carried
                * operands are defined after the phi and are handled by the
                * loop-exit pass below; confirm. */
               if (block.kind & block_kind_loop_header)
                  continue;
               temp_to_phi_ressources[op.tempId()] = index;
            }
         }
      }

      /* visit the loop header phis first in order to create nested affinities */
      if (block.kind & block_kind_loop_exit) {
         /* find loop header */
         auto header_rit = block_rit;
         while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
            header_rit++;

         for (aco_ptr<Instruction>& phi : header_rit->instructions) {
            if (!is_phi(phi))
               break;
            if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
               continue;

            /* create an (empty) merge-set for the phi-related variables */
            auto it = temp_to_phi_ressources.find(phi->definitions[0].tempId());
            unsigned index = phi_ressources.size();
            if (it == temp_to_phi_ressources.end()) {
               temp_to_phi_ressources[phi->definitions[0].tempId()] = index;
               phi_ressources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
            } else {
               index = it->second;
            }
            /* Map the loop-carried operands (index 1+) into the same set so
             * that, when the backwards walk reaches them, they join it. */
            for (unsigned i = 1; i < phi->operands.size(); i++) {
               const Operand& op = phi->operands[i];
               if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
                  temp_to_phi_ressources[op.tempId()] = index;
               }
            }
         }
      }
   }
   /* create affinities */
   for (std::vector<Temp>& vec : phi_ressources) {
      for (unsigned i = 1; i < vec.size(); i++)
         if (vec[i].id() != vec[0].id())
            ctx.assignments[vec[i].id()].affinity = vec[0].id();
   }
}
2543 
void
optimize_encoding_vop2(Program* program, ra_ctx& ctx, RegisterFile& register_file,
                       aco_ptr<Instruction>& instr)
{
   /* try to optimize v_mad_f32 -> v_mac_f32 */
   /* Convert a 3-source VOP3 mad/fma/dot into its compact VOP2 mac/fmac/dot4c
    * form.  This requires (all checked by the bail-out below):
    * - the opcode has such a form on this gfx level / chip,
    * - operand 2 (the accumulator) is a killed VGPR temp, so the definition can
    *   be assigned the accumulator's register (done later by the caller),
    * - at least one of operands 0/1 is a VGPR (VOP2 src1 must be a VGPR; the
    *   swap below moves a VGPR operand into slot 1 if needed),
    * - no modifiers are used and all operands are dword-aligned.
    */
   if ((instr->opcode != aco_opcode::v_mad_f32 &&
        (instr->opcode != aco_opcode::v_fma_f32 || program->gfx_level < GFX10) &&
        instr->opcode != aco_opcode::v_mad_f16 && instr->opcode != aco_opcode::v_mad_legacy_f16 &&
        (instr->opcode != aco_opcode::v_fma_f16 || program->gfx_level < GFX10) &&
        (instr->opcode != aco_opcode::v_pk_fma_f16 || program->gfx_level < GFX10) &&
        (instr->opcode != aco_opcode::v_mad_legacy_f32 || !program->dev.has_mac_legacy32) &&
        (instr->opcode != aco_opcode::v_fma_legacy_f32 || !program->dev.has_mac_legacy32) &&
        (instr->opcode != aco_opcode::v_dot4_i32_i8 || program->family == CHIP_VEGA20)) ||
       !instr->operands[2].isTemp() || !instr->operands[2].isKillBeforeDef() ||
       instr->operands[2].getTemp().type() != RegType::vgpr ||
       ((!instr->operands[0].isTemp() || instr->operands[0].getTemp().type() != RegType::vgpr) &&
        (!instr->operands[1].isTemp() || instr->operands[1].getTemp().type() != RegType::vgpr)) ||
       instr->usesModifiers() || instr->operands[0].physReg().byte() != 0 ||
       instr->operands[1].physReg().byte() != 0 || instr->operands[2].physReg().byte() != 0)
      return;

   /* Ensure the VGPR operand sits in slot 1 (VOP2 src1 must be a VGPR). */
   if (!instr->operands[1].isTemp() || instr->operands[1].getTemp().type() != RegType::vgpr)
      std::swap(instr->operands[0], instr->operands[1]);

   /* If the definition has an affinity to a free register other than the
    * accumulator's, keep the VOP3 form so the affinity can still be honored. */
   unsigned def_id = instr->definitions[0].tempId();
   if (ctx.assignments[def_id].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
      if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
          !register_file.test(affinity.reg, instr->operands[2].bytes()))
         return;
   }

   /* The format is changed in place, so the VOP2 payload must fit into the
    * storage allocated for the original VOP3/VOP3P instruction. */
   static_assert(sizeof(VOP2_instruction) <= sizeof(VOP3_instruction),
                 "Invalid direct instruction cast.");
   static_assert(sizeof(VOP2_instruction) <= sizeof(VOP3P_instruction),
                 "Invalid direct instruction cast.");
   instr->format = Format::VOP2;
   switch (instr->opcode) {
   case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
   case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
   case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
   case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
   case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
   case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break;
   case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break;
   default: break;
   }
}
2594 
void
optimize_encoding_sopk(Program* program, ra_ctx& ctx, RegisterFile& register_file,
                       aco_ptr<Instruction>& instr)
{
   /* try to optimize sop2 with literal source to sopk */
   /* s_add_i32/s_mul_i32/s_cselect_b32 with a literal can be rewritten to
    * s_addk_i32/s_mulk_i32/s_cmovk_i32, saving the 32-bit literal dword.
    * This requires:
    * - the non-literal operand to be a killed SGPR temp (SOPK reads and writes
    *   the same register; the caller fixes def = operand 0 afterwards),
    * - that operand to be in s0-s127 (presumably because the 7-bit SOPK
    *   destination field cannot encode higher registers — confirm with ISA),
    * - the literal to fit into the signed 16-bit SIMM16 field.
    */
   if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_mul_i32 &&
       instr->opcode != aco_opcode::s_cselect_b32)
      return;

   uint32_t literal_idx = 0;

   /* s_cselect_b32's operand order matters (it also reads SCC), so only
    * operand 0 may be the literal there. */
   if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral())
      literal_idx = 1;

   if (!instr->operands[!literal_idx].isTemp() ||
       !instr->operands[!literal_idx].isKillBeforeDef() ||
       instr->operands[!literal_idx].getTemp().type() != RegType::sgpr ||
       instr->operands[!literal_idx].physReg() >= 128)
      return;

   if (!instr->operands[literal_idx].isLiteral())
      return;

   /* Accept only values that sign-extend from 16 bits (all upper bits equal
    * to bit 15). */
   const uint32_t i16_mask = 0xffff8000u;
   uint32_t value = instr->operands[literal_idx].constantValue();
   if ((value & i16_mask) && (value & i16_mask) != i16_mask)
      return;

   /* If the definition has an affinity to a free register other than the
    * non-literal operand's, keep the SOP2 form so it can be honored. */
   unsigned def_id = instr->definitions[0].tempId();
   if (ctx.assignments[def_id].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
      if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() &&
          !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes()))
         return;
   }

   /* The format is changed in place, so SOPK must fit into the SOP2 storage. */
   static_assert(sizeof(SOPK_instruction) <= sizeof(SOP2_instruction),
                 "Invalid direct instruction cast.");
   instr->format = Format::SOPK;
   SOPK_instruction* instr_sopk = &instr->sopk();

   /* Move the literal into the immediate field and drop it from the operand
    * list, leaving the register operand in slot 0. */
   instr_sopk->imm = instr_sopk->operands[literal_idx].constantValue() & 0xffff;
   if (literal_idx == 0)
      std::swap(instr_sopk->operands[0], instr_sopk->operands[1]);
   if (instr_sopk->operands.size() > 2)
      std::swap(instr_sopk->operands[1], instr_sopk->operands[2]);
   instr_sopk->operands.pop_back();

   switch (instr_sopk->opcode) {
   case aco_opcode::s_add_i32: instr_sopk->opcode = aco_opcode::s_addk_i32; break;
   case aco_opcode::s_mul_i32: instr_sopk->opcode = aco_opcode::s_mulk_i32; break;
   case aco_opcode::s_cselect_b32: instr_sopk->opcode = aco_opcode::s_cmovk_i32; break;
   default: unreachable("illegal instruction");
   }
}
2650 
2651 void
optimize_encoding(Program* program, ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)2652 optimize_encoding(Program* program, ra_ctx& ctx, RegisterFile& register_file,
2653                   aco_ptr<Instruction>& instr)
2654 {
2655    if (instr->isVALU())
2656       optimize_encoding_vop2(program, ctx, register_file, instr);
2657    if (instr->isSALU())
2658       optimize_encoding_sopk(program, ctx, register_file, instr);
2659 }
2660 
2661 } /* end namespace */
2662 
2663 void
register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)2664 register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
2665 {
2666    ra_ctx ctx(program, policy);
2667    get_affinities(ctx, live_out_per_block);
2668 
2669    for (Block& block : program->blocks) {
2670       ctx.block = &block;
2671 
2672       /* initialize register file */
2673       RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
2674       ctx.war_hint.reset();
2675 
2676       std::vector<aco_ptr<Instruction>> instructions;
2677 
2678       /* this is a slight adjustment from the paper as we already have phi nodes:
2679        * We consider them incomplete phis and only handle the definition. */
2680       get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]);
2681 
2682       /* If this is a merge block, the state of the register file at the branch instruction of the
2683        * predecessors corresponds to the state after phis at the merge block. So, we allocate a
2684        * register for the predecessor's branch definitions as if there was a phi.
2685        */
2686       if (!block.linear_preds.empty() &&
2687           (block.linear_preds.size() != 1 ||
2688            program->blocks[block.linear_preds[0]].linear_succs.size() == 1)) {
2689          PhysReg br_reg = get_reg_phi(ctx, live_out_per_block[block.index], register_file,
2690                                       instructions, block, ctx.phi_dummy, Temp(0, s2));
2691          for (unsigned pred : block.linear_preds) {
2692             program->blocks[pred].scc_live_out = register_file[scc];
2693             aco_ptr<Instruction>& br = program->blocks[pred].instructions.back();
2694 
2695             assert(br->definitions.size() == 1 && br->definitions[0].regClass() == s2 &&
2696                    br->definitions[0].isKill());
2697 
2698             br->definitions[0].setFixed(br_reg);
2699          }
2700       }
2701 
2702       /* Handle all other instructions of the block */
2703       auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
2704       std::vector<aco_ptr<Instruction>>::iterator instr_it =
2705          std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
2706       for (; instr_it != block.instructions.end(); ++instr_it) {
2707          aco_ptr<Instruction>& instr = *instr_it;
2708 
2709          /* parallelcopies from p_phi are inserted here which means
2710           * live ranges of killed operands end here as well */
2711          if (instr->opcode == aco_opcode::p_logical_end) {
2712             /* no need to process this instruction any further */
2713             if (block.logical_succs.size() != 1) {
2714                instructions.emplace_back(std::move(instr));
2715                continue;
2716             }
2717 
2718             Block& succ = program->blocks[block.logical_succs[0]];
2719             unsigned idx = 0;
2720             for (; idx < succ.logical_preds.size(); idx++) {
2721                if (succ.logical_preds[idx] == block.index)
2722                   break;
2723             }
2724             for (aco_ptr<Instruction>& phi : succ.instructions) {
2725                if (phi->opcode == aco_opcode::p_phi) {
2726                   if (phi->operands[idx].isTemp() &&
2727                       phi->operands[idx].getTemp().type() == RegType::sgpr &&
2728                       phi->operands[idx].isFirstKillBeforeDef()) {
2729                      Definition phi_op(
2730                         read_variable(ctx, phi->operands[idx].getTemp(), block.index));
2731                      phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg);
2732                      register_file.clear(phi_op);
2733                   }
2734                } else if (phi->opcode != aco_opcode::p_linear_phi) {
2735                   break;
2736                }
2737             }
2738             instructions.emplace_back(std::move(instr));
2739             continue;
2740          }
2741 
2742          /* unconditional branches are handled after phis of the target */
2743          if (instr->opcode == aco_opcode::p_branch) {
2744             /* last instruction of the block */
2745             instructions.emplace_back(std::move(instr));
2746             break;
2747          }
2748 
2749          std::vector<std::pair<Operand, Definition>> parallelcopy;
2750 
2751          assert(!is_phi(instr));
2752 
2753          bool temp_in_scc = register_file[scc];
2754 
2755          /* handle operands */
2756          for (unsigned i = 0; i < instr->operands.size(); ++i) {
2757             auto& operand = instr->operands[i];
2758             if (!operand.isTemp())
2759                continue;
2760 
2761             /* rename operands */
2762             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
2763             assert(ctx.assignments[operand.tempId()].assigned);
2764 
2765             PhysReg reg = ctx.assignments[operand.tempId()].reg;
2766             if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
2767                operand.setFixed(reg);
2768             else
2769                get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
2770 
2771             if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) ||
2772                 (instr->isDS() && instr->ds().gds)) {
2773                for (unsigned j = 0; j < operand.size(); j++)
2774                   ctx.war_hint.set(operand.physReg().reg() + j);
2775             }
2776          }
2777 
2778          /* remove dead vars from register file */
2779          for (const Operand& op : instr->operands) {
2780             if (op.isTemp() && op.isFirstKillBeforeDef())
2781                register_file.clear(op);
2782          }
2783 
2784          optimize_encoding(program, ctx, register_file, instr);
2785 
2786          /* Handle definitions which must have the same register as an operand.
2787           * We expect that the definition has the same size as the operand, otherwise the new
2788           * location for the operand (if it's not killed) might intersect with the old one.
2789           * We can't read from the old location because it's corrupted, and we can't write the new
2790           * location because that's used by a live-through operand.
2791           */
2792          if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
2793              instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 ||
2794              instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 ||
2795              instr->opcode == aco_opcode::v_mac_legacy_f32 ||
2796              instr->opcode == aco_opcode::v_fmac_legacy_f32 ||
2797              instr->opcode == aco_opcode::v_pk_fmac_f16 ||
2798              instr->opcode == aco_opcode::v_writelane_b32 ||
2799              instr->opcode == aco_opcode::v_writelane_b32_e64 ||
2800              instr->opcode == aco_opcode::v_dot4c_i32_i8) {
2801             assert(instr->definitions[0].bytes() == instr->operands[2].bytes() ||
2802                    instr->operands[2].regClass() == v1);
2803             instr->definitions[0].setFixed(instr->operands[2].physReg());
2804          } else if (instr->opcode == aco_opcode::s_addk_i32 ||
2805                     instr->opcode == aco_opcode::s_mulk_i32 ||
2806                     instr->opcode == aco_opcode::s_cmovk_i32) {
2807             assert(instr->definitions[0].bytes() == instr->operands[0].bytes());
2808             instr->definitions[0].setFixed(instr->operands[0].physReg());
2809          } else if (instr->isMUBUF() && instr->definitions.size() == 1 &&
2810                     instr->operands.size() == 4) {
2811             assert(instr->definitions[0].bytes() == instr->operands[3].bytes());
2812             instr->definitions[0].setFixed(instr->operands[3].physReg());
2813          } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
2814                     !instr->operands[2].isUndefined()) {
2815             assert(instr->definitions[0].bytes() == instr->operands[2].bytes());
2816             instr->definitions[0].setFixed(instr->operands[2].physReg());
2817          }
2818 
2819          ctx.defs_done.reset();
2820 
2821          /* handle fixed definitions first */
2822          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
2823             auto& definition = instr->definitions[i];
2824             if (!definition.isFixed())
2825                continue;
2826 
2827             adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
2828             /* check if the target register is blocked */
2829             if (register_file.test(definition.physReg(), definition.bytes())) {
2830                const PhysRegInterval def_regs{definition.physReg(), definition.size()};
2831 
2832                /* create parallelcopy pair to move blocking vars */
2833                std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs);
2834 
2835                RegisterFile tmp_file(register_file);
2836                /* re-enable the killed operands, so that we don't move the blocking vars there */
2837                for (const Operand& op : instr->operands) {
2838                   if (op.isTemp() && op.isFirstKillBeforeDef())
2839                      tmp_file.fill(op);
2840                }
2841 
2842                ASSERTED bool success = false;
2843                DefInfo info(ctx, instr, definition.regClass(), -1);
2844                success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr,
2845                                              def_regs);
2846                assert(success);
2847 
2848                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
2849             }
2850             ctx.defs_done.set(i);
2851 
2852             if (!definition.isTemp())
2853                continue;
2854 
2855             ctx.assignments[definition.tempId()].set(definition);
2856             register_file.fill(definition);
2857          }
2858 
2859          /* handle all other definitions */
2860          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
2861             Definition* definition = &instr->definitions[i];
2862 
2863             if (definition->isFixed() || !definition->isTemp())
2864                continue;
2865 
2866             /* find free reg */
2867             if (instr->opcode == aco_opcode::p_split_vector) {
2868                PhysReg reg = instr->operands[0].physReg();
2869                RegClass rc = definition->regClass();
2870                for (unsigned j = 0; j < i; j++)
2871                   reg.reg_b += instr->definitions[j].bytes();
               if (get_reg_specified(ctx, register_file, rc, instr, reg)) {
                  definition->setFixed(reg);
               } else if (i == 0) {
                  /* First definition: try to find room for a register range covering all of
                   * operand 0's bytes, so that the following definitions (placed back-to-back
                   * below) are likely to fit right after this one.
                   * NOTE(review): the opcode this branch belongs to is decided above this view —
                   * presumably a vector-split pseudo; confirm against the preceding lines. */
                  RegClass vec_rc = RegClass::get(rc.type(), instr->operands[0].bytes());
                  DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
                  std::pair<PhysReg, bool> res = get_reg_simple(ctx, register_file, info);
                  reg = res.first;
                  if (res.second && get_reg_specified(ctx, register_file, rc, instr, reg))
                     definition->setFixed(reg);
               } else if (instr->definitions[i - 1].isFixed()) {
                  /* Otherwise try to place this definition directly after the previous one,
                   * keeping the definitions contiguous in the register file. */
                  reg = instr->definitions[i - 1].physReg();
                  reg.reg_b += instr->definitions[i - 1].bytes();
                  if (get_reg_specified(ctx, register_file, rc, instr, reg))
                     definition->setFixed(reg);
               }
            } else if (instr->opcode == aco_opcode::p_wqm ||
                       instr->opcode == aco_opcode::p_parallelcopy) {
               /* Prefer reusing the operand's register for the definition (a no-op copy)
                * when the register types match and the target bytes are currently free. */
               PhysReg reg = instr->operands[i].physReg();
               if (instr->operands[i].isTemp() &&
                   instr->operands[i].getTemp().type() == definition->getTemp().type() &&
                   !register_file.test(reg, definition->bytes()))
                  definition->setFixed(reg);
            } else if (instr->opcode == aco_opcode::p_extract_vector) {
               /* Try to assign the definition to the extracted element's position inside
                * the source vector: base register + index * element size (in bytes). */
               PhysReg reg = instr->operands[0].physReg();
               reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
               if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))
                  definition->setFixed(reg);
            } else if (instr->opcode == aco_opcode::p_create_vector) {
               /* Vector creation uses a dedicated search that also considers where the
                * operands already live; it may add parallelcopies to move operands. */
               PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
                                                   parallelcopy, instr);
               update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
               definition->setFixed(reg);
            }

            /* Fallback: none of the opcode-specific heuristics fixed the definition,
             * so run the generic register search. */
            if (!definition->isFixed()) {
               Temp tmp = definition->getTemp();
               if (definition->regClass().is_subdword() && definition->bytes() < 4) {
                  PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
                  definition->setFixed(reg);
                  /* If the sub-dword result is not dword-aligned or shares a dword with
                   * other allocations, the instruction may need adjusting. */
                  if (reg.byte() || register_file.test(reg, 4)) {
                     add_subdword_definition(program, instr, reg);
                     definition = &instr->definitions[i]; /* add_subdword_definition can invalidate
                                                             the reference */
                  }
               } else {
                  definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
               }
               update_renames(ctx, register_file, parallelcopy, instr,
                              instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops
                                                                           : (UpdateRenames)0);
            }

            /* Sanity check: VGPRs occupy the physical register space from 256 upwards,
             * everything else (SGPRs etc.) lies below 256. */
            assert(
               definition->isFixed() &&
               ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
                (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
            ctx.defs_done.set(i);
            ctx.assignments[definition->tempId()].set(*definition);
            register_file.fill(*definition);
         }

         handle_pseudo(ctx, register_file, instr.get());

         /* kill definitions and late-kill operands and ensure that sub-dword operands can actually
          * be read */
         for (const Definition& def : instr->definitions) {
            if (def.isTemp() && def.isKill())
               register_file.clear(def);
         }
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            const Operand& op = instr->operands[i];
            if (op.isTemp() && op.isFirstKill() && op.isLateKill())
               register_file.clear(op);
            /* Operands starting at a non-zero byte offset within a register need
             * instruction-specific fixups to be readable. */
            if (op.isTemp() && op.physReg().byte() != 0)
               add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
         }

         /* emit parallelcopy */
         if (!parallelcopy.empty()) {
            aco_ptr<Pseudo_instruction> pc;
            pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy,
                                                            Format::PSEUDO, parallelcopy.size(),
                                                            parallelcopy.size()));
            /* Track whether the copy set contains linear VGPRs or SGPR copies whose
             * destinations overlap other SGPR sources: in those cases the parallelcopy
             * lowering may need SCC (see handle_pseudo below), so we must know whether
             * SCC is live across this instruction (temp_in_scc). */
            bool linear_vgpr = false;
            bool sgpr_operands_alias_defs = false;
            /* One bit per SGPR (4 * 64 = 256 covers the whole SGPR space). */
            uint64_t sgpr_operands[4] = {0, 0, 0, 0};
            for (unsigned i = 0; i < parallelcopy.size(); i++) {
               linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();

               if (temp_in_scc && parallelcopy[i].first.isTemp() &&
                   parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
                  if (!sgpr_operands_alias_defs) {
                     /* Record the SGPRs read by this copy ... */
                     unsigned reg = parallelcopy[i].first.physReg().reg();
                     unsigned size = parallelcopy[i].first.getTemp().size();
                     sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);

                     /* ... and check whether its destination overlaps any recorded source:
                      * overlapping copies can't all be plain moves (may need swaps). */
                     reg = parallelcopy[i].second.physReg().reg();
                     size = parallelcopy[i].second.getTemp().size();
                     if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
                        sgpr_operands_alias_defs = true;
                  }
               }

               pc->operands[i] = parallelcopy[i].first;
               pc->definitions[i] = parallelcopy[i].second;
               assert(pc->operands[i].size() == pc->definitions[i].size());

               /* it might happen that the operand is already renamed. we have to restore the
                * original name. */
               std::unordered_map<unsigned, Temp>::iterator it =
                  ctx.orig_names.find(pc->operands[i].tempId());
               Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
               ctx.orig_names[pc->definitions[i].tempId()] = orig;
               ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
            }

            if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
               /* disable definitions and re-enable operands */
               RegisterFile tmp_file(register_file);
               for (const Definition& def : instr->definitions) {
                  if (def.isTemp() && !def.isKill())
                     tmp_file.clear(def);
               }
               for (const Operand& op : instr->operands) {
                  if (op.isTemp() && op.isFirstKill())
                     tmp_file.block(op.physReg(), op.regClass());
               }

               /* Let handle_pseudo pick a scratch register etc. using the register state
                * as it is at the point where the parallelcopy executes. */
               handle_pseudo(ctx, tmp_file, pc.get());
            } else {
               /* SCC is not live (or no copy needs it), so the lowering is free to clobber it. */
               pc->tmp_in_scc = false;
            }

            instructions.emplace_back(std::move(pc));
         }

         /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
         bool instr_needs_vop3 =
            !instr->isVOP3() &&
            ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
             (instr->opcode == aco_opcode::v_cndmask_b32 &&
              !(instr->operands[2].physReg() == vcc)) ||
             ((instr->opcode == aco_opcode::v_add_co_u32 ||
               instr->opcode == aco_opcode::v_addc_co_u32 ||
               instr->opcode == aco_opcode::v_sub_co_u32 ||
               instr->opcode == aco_opcode::v_subb_co_u32 ||
               instr->opcode == aco_opcode::v_subrev_co_u32 ||
               instr->opcode == aco_opcode::v_subbrev_co_u32) &&
              !(instr->definitions[1].physReg() == vcc)) ||
             ((instr->opcode == aco_opcode::v_addc_co_u32 ||
               instr->opcode == aco_opcode::v_subb_co_u32 ||
               instr->opcode == aco_opcode::v_subbrev_co_u32) &&
              !(instr->operands[2].physReg() == vcc)));
         if (instr_needs_vop3) {

            /* if the first operand is a literal, we have to move it to a reg */
            if (instr->operands.size() && instr->operands[0].isLiteral() &&
                program->gfx_level < GFX10) {
               bool can_sgpr = true;
               /* check, if we have to move to vgpr */
               for (const Operand& op : instr->operands) {
                  if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
                     can_sgpr = false;
                     break;
                  }
               }
               /* disable definitions and re-enable operands */
               RegisterFile tmp_file(register_file);
               for (const Definition& def : instr->definitions)
                  tmp_file.clear(def);
               for (const Operand& op : instr->operands) {
                  if (op.isTemp() && op.isFirstKill())
                     tmp_file.block(op.physReg(), op.regClass());
               }
               /* Allocate a fresh temporary to hold the literal and find a register for it
                * in the pre-instruction register state. */
               Temp tmp = program->allocateTmp(can_sgpr ? s1 : v1);
               ctx.assignments.emplace_back();
               PhysReg reg = get_reg(ctx, tmp_file, tmp, parallelcopy, instr);
               update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);

               /* Emit the move (s_mov_b32 if no SGPR operand conflicts, else v_mov_b32)
                * and rewrite operand 0 to read the new temporary. */
               aco_ptr<Instruction> mov;
               if (can_sgpr)
                  mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32,
                                                                 Format::SOP1, 1, 1));
               else
                  mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32,
                                                                 Format::VOP1, 1, 1));
               mov->operands[0] = instr->operands[0];
               mov->definitions[0] = Definition(tmp);
               mov->definitions[0].setFixed(reg);

               instr->operands[0] = Operand(tmp);
               instr->operands[0].setFixed(reg);
               instr->operands[0].setFirstKill(true);

               instructions.emplace_back(std::move(mov));
            }

            /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
            aco_ptr<Instruction> tmp = std::move(instr);
            Format format = asVOP3(tmp->format);
            instr.reset(create_instruction<VOP3_instruction>(
               tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
            std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin());
            std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
         }

         instructions.emplace_back(std::move(*instr_it));

      } /* end for Instr */

      block.instructions = std::move(instructions);
   } /* end for BB */

   /* num_gpr = rnd_up(max_used_gpr + 1) */
   program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1);
   program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);

   program->progress = CompilationProgress::after_ra;
}
3091 
3092 } // namespace aco
3093