/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include <algorithm>
#include <map>
#include <unordered_map>
#include <vector>

/*
 * Implements an algorithm to lower to Conventional SSA Form (CSSA).
 * After "Revisiting Out-of-SSA Translation for Correctness, Code Quality, and Efficiency"
 * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon,
 *
 * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
 * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
 * The algorithm coalesces non-interfering phi-resources while taking value-equality
 * into account. Re-indexes the SSA-defs.
42 */ 43 44namespace aco { 45namespace { 46 47typedef std::vector<Temp> merge_set; 48 49struct copy { 50 Definition def; 51 Operand op; 52}; 53 54struct merge_node { 55 Operand value = Operand(); /* original value: can be an SSA-def or constant value */ 56 uint32_t index = -1u; /* index into the vector of merge sets */ 57 uint32_t defined_at = -1u; /* defining block */ 58 59 /* we also remember two dominating defs with the same value: */ 60 Temp equal_anc_in = Temp(); /* within the same merge set */ 61 Temp equal_anc_out = Temp(); /* from a different set */ 62}; 63 64struct cssa_ctx { 65 Program* program; 66 std::vector<IDSet>& live_out; /* live-out sets per block */ 67 std::vector<std::vector<copy>> parallelcopies; /* copies per block */ 68 std::vector<merge_set> merge_sets; /* each vector is one (ordered) merge set */ 69 std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */ 70}; 71 72/* create (virtual) parallelcopies for each phi instruction and 73 * already merge copy-definitions with phi-defs into merge sets */ 74void 75collect_parallelcopies(cssa_ctx& ctx) 76{ 77 ctx.parallelcopies.resize(ctx.program->blocks.size()); 78 Builder bld(ctx.program); 79 for (Block& block : ctx.program->blocks) { 80 for (aco_ptr<Instruction>& phi : block.instructions) { 81 if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) 82 break; 83 84 const Definition& def = phi->definitions[0]; 85 86 /* if the definition is not temp, it is the exec mask. 87 * We can reload the exec mask directly from the spill slot. 88 */ 89 if (!def.isTemp()) 90 continue; 91 92 std::vector<unsigned>& preds = 93 phi->opcode == aco_opcode::p_phi ? 
block.logical_preds : block.linear_preds; 94 uint32_t index = ctx.merge_sets.size(); 95 merge_set set; 96 97 bool has_preheader_copy = false; 98 for (unsigned i = 0; i < phi->operands.size(); i++) { 99 Operand op = phi->operands[i]; 100 if (op.isUndefined()) 101 continue; 102 103 if (def.regClass().type() == RegType::sgpr && !op.isTemp()) { 104 /* SGPR inline constants and literals on GFX10+ can be spilled 105 * and reloaded directly (without intermediate register) */ 106 if (op.isConstant()) { 107 if (ctx.program->gfx_level >= GFX10) 108 continue; 109 if (op.size() == 1 && !op.isLiteral()) 110 continue; 111 } else { 112 assert(op.isFixed() && op.physReg() == exec); 113 continue; 114 } 115 } 116 117 /* create new temporary and rename operands */ 118 Temp tmp = bld.tmp(def.regClass()); 119 ctx.parallelcopies[preds[i]].emplace_back(copy{Definition(tmp), op}); 120 phi->operands[i] = Operand(tmp); 121 122 /* place the new operands in the same merge set */ 123 set.emplace_back(tmp); 124 ctx.merge_node_table[tmp.id()] = {op, index, preds[i]}; 125 126 /* update the liveness information */ 127 if (op.isKill()) 128 ctx.live_out[preds[i]].erase(op.tempId()); 129 ctx.live_out[preds[i]].insert(tmp.id()); 130 131 has_preheader_copy |= i == 0 && block.kind & block_kind_loop_header; 132 } 133 134 if (set.empty()) 135 continue; 136 137 /* place the definition in dominance-order */ 138 if (def.isTemp()) { 139 if (has_preheader_copy) 140 set.emplace(std::next(set.begin()), def.getTemp()); 141 else if (block.kind & block_kind_loop_header) 142 set.emplace(set.begin(), def.getTemp()); 143 else 144 set.emplace_back(def.getTemp()); 145 ctx.merge_node_table[def.tempId()] = {Operand(def.getTemp()), index, block.index}; 146 } 147 ctx.merge_sets.emplace_back(set); 148 } 149 } 150} 151 152/* check whether the definition of a comes after b. 
*/ 153inline bool 154defined_after(cssa_ctx& ctx, Temp a, Temp b) 155{ 156 merge_node& node_a = ctx.merge_node_table[a.id()]; 157 merge_node& node_b = ctx.merge_node_table[b.id()]; 158 if (node_a.defined_at == node_b.defined_at) 159 return a.id() > b.id(); 160 161 return node_a.defined_at > node_b.defined_at; 162} 163 164/* check whether a dominates b where b is defined after a */ 165inline bool 166dominates(cssa_ctx& ctx, Temp a, Temp b) 167{ 168 assert(defined_after(ctx, b, a)); 169 merge_node& node_a = ctx.merge_node_table[a.id()]; 170 merge_node& node_b = ctx.merge_node_table[b.id()]; 171 unsigned idom = node_b.defined_at; 172 while (idom > node_a.defined_at) 173 idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom 174 : ctx.program->blocks[idom].linear_idom; 175 176 return idom == node_a.defined_at; 177} 178 179/* check intersection between var and parent: 180 * We already know that parent dominates var. */ 181inline bool 182intersects(cssa_ctx& ctx, Temp var, Temp parent) 183{ 184 merge_node& node_var = ctx.merge_node_table[var.id()]; 185 merge_node& node_parent = ctx.merge_node_table[parent.id()]; 186 assert(node_var.index != node_parent.index); 187 uint32_t block_idx = node_var.defined_at; 188 189 /* if the parent is live-out at the definition block of var, they intersect */ 190 bool parent_live = ctx.live_out[block_idx].count(parent.id()); 191 if (parent_live) 192 return true; 193 194 /* parent is defined in a different block than var */ 195 if (node_parent.defined_at < node_var.defined_at) { 196 /* if the parent is not live-in, they don't interfere */ 197 std::vector<uint32_t>& preds = var.type() == RegType::vgpr 198 ? 
ctx.program->blocks[block_idx].logical_preds 199 : ctx.program->blocks[block_idx].linear_preds; 200 for (uint32_t pred : preds) { 201 if (!ctx.live_out[pred].count(parent.id())) 202 return false; 203 } 204 } 205 206 for (const copy& cp : ctx.parallelcopies[block_idx]) { 207 /* if var is defined at the edge, they don't intersect */ 208 if (cp.def.getTemp() == var) 209 return false; 210 if (cp.op.isTemp() && cp.op.getTemp() == parent) 211 parent_live = true; 212 } 213 /* if the parent is live at the edge, they intersect */ 214 if (parent_live) 215 return true; 216 217 /* both, parent and var, are present in the same block */ 218 const Block& block = ctx.program->blocks[block_idx]; 219 for (auto it = block.instructions.crbegin(); it != block.instructions.crend(); ++it) { 220 /* if the parent was not encountered yet, it can only be used by a phi */ 221 if (is_phi(it->get())) 222 break; 223 224 for (const Definition& def : (*it)->definitions) { 225 if (!def.isTemp()) 226 continue; 227 /* if parent was not found yet, they don't intersect */ 228 if (def.getTemp() == var) 229 return false; 230 } 231 232 for (const Operand& op : (*it)->operands) { 233 if (!op.isTemp()) 234 continue; 235 /* if the var was defined before this point, they intersect */ 236 if (op.getTemp() == parent) 237 return true; 238 } 239 } 240 241 return false; 242} 243 244/* check interference between var and parent: 245 * i.e. they have different values and intersect. 246 * If parent and var share the same value, also updates the equal ancestor. 
*/ 247inline bool 248interference(cssa_ctx& ctx, Temp var, Temp parent) 249{ 250 assert(var != parent); 251 merge_node& node_var = ctx.merge_node_table[var.id()]; 252 node_var.equal_anc_out = Temp(); 253 254 if (node_var.index == ctx.merge_node_table[parent.id()].index) { 255 /* check/update in other set */ 256 parent = ctx.merge_node_table[parent.id()].equal_anc_out; 257 } 258 259 Temp tmp = parent; 260 /* check if var intersects with parent or any equal-valued ancestor */ 261 while (tmp != Temp() && !intersects(ctx, var, tmp)) { 262 merge_node& node_tmp = ctx.merge_node_table[tmp.id()]; 263 tmp = node_tmp.equal_anc_in; 264 } 265 266 /* no intersection found */ 267 if (tmp == Temp()) 268 return false; 269 270 /* var and parent, same value, but in different sets */ 271 if (node_var.value == ctx.merge_node_table[parent.id()].value) { 272 node_var.equal_anc_out = tmp; 273 return false; 274 } 275 276 /* var and parent, different values and intersect */ 277 return true; 278} 279 280/* tries to merge set_b into set_a of given temporary and 281 * drops that temporary as it is being coalesced */ 282bool 283try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b) 284{ 285 auto def_node_it = ctx.merge_node_table.find(dst.id()); 286 uint32_t index = def_node_it->second.index; 287 merge_set& set_a = ctx.merge_sets[index]; 288 std::vector<Temp> dom; /* stack of the traversal */ 289 merge_set union_set; /* the new merged merge-set */ 290 uint32_t i_a = 0; 291 uint32_t i_b = 0; 292 293 while (i_a < set_a.size() || i_b < set_b.size()) { 294 Temp current; 295 if (i_a == set_a.size()) 296 current = set_b[i_b++]; 297 else if (i_b == set_b.size()) 298 current = set_a[i_a++]; 299 /* else pick the one defined first */ 300 else if (defined_after(ctx, set_a[i_a], set_b[i_b])) 301 current = set_b[i_b++]; 302 else 303 current = set_a[i_a++]; 304 305 while (!dom.empty() && !dominates(ctx, dom.back(), current)) 306 dom.pop_back(); /* not the desired parent, remove */ 307 308 if 
(!dom.empty() && interference(ctx, current, dom.back())) 309 return false; /* intersection detected */ 310 311 dom.emplace_back(current); /* otherwise, keep checking */ 312 if (current != dst) 313 union_set.emplace_back(current); /* maintain the new merge-set sorted */ 314 } 315 316 /* update hashmap */ 317 for (Temp t : union_set) { 318 merge_node& node = ctx.merge_node_table[t.id()]; 319 /* update the equal ancestors: 320 * i.e. the 'closest' dominating def with the same value */ 321 Temp in = node.equal_anc_in; 322 Temp out = node.equal_anc_out; 323 if (in == Temp() || (out != Temp() && defined_after(ctx, out, in))) 324 node.equal_anc_in = out; 325 node.equal_anc_out = Temp(); 326 /* update merge-set index */ 327 node.index = index; 328 } 329 set_b = merge_set(); /* free the old set_b */ 330 ctx.merge_sets[index] = union_set; 331 ctx.merge_node_table.erase(dst.id()); /* remove the temporary */ 332 333 return true; 334} 335 336/* returns true if the copy can safely be omitted */ 337bool 338try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx) 339{ 340 /* we can only coalesce temporaries */ 341 if (!copy.op.isTemp()) 342 return false; 343 344 /* try emplace a merge_node for the copy operand */ 345 merge_node& op_node = ctx.merge_node_table[copy.op.tempId()]; 346 if (op_node.defined_at == -1u) { 347 /* find defining block of operand */ 348 uint32_t pred = block_idx; 349 do { 350 block_idx = pred; 351 pred = copy.op.regClass().type() == RegType::vgpr ? 
ctx.program->blocks[pred].logical_idom 352 : ctx.program->blocks[pred].linear_idom; 353 } while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId())); 354 op_node.defined_at = block_idx; 355 op_node.value = copy.op; 356 } 357 358 /* we can only coalesce copies of the same register class */ 359 if (copy.op.regClass() != copy.def.regClass()) 360 return false; 361 362 /* check if this operand has not yet been coalesced */ 363 if (op_node.index == -1u) { 364 merge_set op_set = merge_set{copy.op.getTemp()}; 365 return try_merge_merge_set(ctx, copy.def.getTemp(), op_set); 366 } 367 368 /* check if this operand has been coalesced into the same set */ 369 assert(ctx.merge_node_table.count(copy.def.tempId())); 370 if (op_node.index == ctx.merge_node_table[copy.def.tempId()].index) 371 return true; 372 373 /* otherwise, try to coalesce both merge sets */ 374 return try_merge_merge_set(ctx, copy.def.getTemp(), ctx.merge_sets[op_node.index]); 375} 376 377/* node in the location-transfer-graph */ 378struct ltg_node { 379 copy cp; 380 uint32_t read_idx; 381 uint32_t num_uses = 0; 382}; 383 384/* emit the copies in an order that does not 385 * create interferences within a merge-set */ 386void 387emit_copies_block(Builder& bld, std::map<uint32_t, ltg_node>& ltg, RegType type) 388{ 389 auto&& it = ltg.begin(); 390 while (it != ltg.end()) { 391 const copy& cp = it->second.cp; 392 /* wrong regclass or still needed as operand */ 393 if (cp.def.regClass().type() != type || it->second.num_uses > 0) { 394 ++it; 395 continue; 396 } 397 398 /* emit the copy */ 399 bld.copy(cp.def, it->second.cp.op); 400 401 /* update the location transfer graph */ 402 if (it->second.read_idx != -1u) { 403 auto&& other = ltg.find(it->second.read_idx); 404 if (other != ltg.end()) 405 other->second.num_uses--; 406 } 407 ltg.erase(it); 408 it = ltg.begin(); 409 } 410 411 /* count the number of remaining circular dependencies */ 412 unsigned num = std::count_if(ltg.begin(), ltg.end(), 413 [&](auto& 
n) { return n.second.cp.def.regClass().type() == type; }); 414 415 /* if there are circular dependencies, we just emit them as single parallelcopy */ 416 if (num) { 417 // TODO: this should be restricted to a feasible number of registers 418 // and otherwise use a temporary to avoid having to reload more (spilled) 419 // variables than we have registers. 420 aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>( 421 aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)}; 422 it = ltg.begin(); 423 for (unsigned i = 0; i < num; i++) { 424 while (it->second.cp.def.regClass().type() != type) 425 ++it; 426 427 copy->definitions[i] = it->second.cp.def; 428 copy->operands[i] = it->second.cp.op; 429 it = ltg.erase(it); 430 } 431 bld.insert(std::move(copy)); 432 } 433} 434 435/* either emits or coalesces all parallelcopies and 436 * renames the phi-operands accordingly. */ 437void 438emit_parallelcopies(cssa_ctx& ctx) 439{ 440 std::unordered_map<uint32_t, Operand> renames; 441 442 /* we iterate backwards to prioritize coalescing in else-blocks */ 443 for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) { 444 if (ctx.parallelcopies[i].empty()) 445 continue; 446 447 std::map<uint32_t, ltg_node> ltg; 448 bool has_vgpr_copy = false; 449 bool has_sgpr_copy = false; 450 451 /* first, try to coalesce all parallelcopies */ 452 for (const copy& cp : ctx.parallelcopies[i]) { 453 if (try_coalesce_copy(ctx, cp, i)) { 454 renames.emplace(cp.def.tempId(), cp.op); 455 /* update liveness info */ 456 ctx.live_out[i].erase(cp.def.tempId()); 457 ctx.live_out[i].insert(cp.op.tempId()); 458 } else { 459 uint32_t read_idx = -1u; 460 if (cp.op.isTemp()) 461 read_idx = ctx.merge_node_table[cp.op.tempId()].index; 462 uint32_t write_idx = ctx.merge_node_table[cp.def.tempId()].index; 463 assert(write_idx != -1u); 464 ltg[write_idx] = {cp, read_idx}; 465 466 bool is_vgpr = cp.def.regClass().type() == RegType::vgpr; 467 has_vgpr_copy |= is_vgpr; 468 has_sgpr_copy |= !is_vgpr; 469 
} 470 } 471 472 /* build location-transfer-graph */ 473 for (auto& pair : ltg) { 474 if (pair.second.read_idx == -1u) 475 continue; 476 auto&& it = ltg.find(pair.second.read_idx); 477 if (it != ltg.end()) 478 it->second.num_uses++; 479 } 480 481 /* emit parallelcopies ordered */ 482 Builder bld(ctx.program); 483 Block& block = ctx.program->blocks[i]; 484 485 if (has_vgpr_copy) { 486 /* emit VGPR copies */ 487 auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool 488 { return inst->opcode == aco_opcode::p_logical_end; }; 489 auto it = 490 std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd); 491 bld.reset(&block.instructions, std::prev(it.base())); 492 emit_copies_block(bld, ltg, RegType::vgpr); 493 } 494 495 if (has_sgpr_copy) { 496 /* emit SGPR copies */ 497 aco_ptr<Instruction> branch = std::move(block.instructions.back()); 498 block.instructions.pop_back(); 499 bld.reset(&block.instructions); 500 emit_copies_block(bld, ltg, RegType::sgpr); 501 bld.insert(std::move(branch)); 502 } 503 } 504 505 /* finally, rename coalesced phi operands */ 506 for (Block& block : ctx.program->blocks) { 507 for (aco_ptr<Instruction>& phi : block.instructions) { 508 if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) 509 break; 510 511 for (Operand& op : phi->operands) { 512 if (!op.isTemp()) 513 continue; 514 auto&& it = renames.find(op.tempId()); 515 if (it != renames.end()) { 516 op = it->second; 517 renames.erase(it); 518 } 519 } 520 } 521 } 522 assert(renames.empty()); 523} 524 525} /* end namespace */ 526 527void 528lower_to_cssa(Program* program, live& live_vars) 529{ 530 reindex_ssa(program, live_vars.live_out); 531 cssa_ctx ctx = {program, live_vars.live_out}; 532 collect_parallelcopies(ctx); 533 emit_parallelcopies(ctx); 534 535 /* update live variable information */ 536 live_vars = live_var_analysis(program); 537} 538} // namespace aco 539