/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"

#include <algorithm>
#include <map>
#include <vector>

namespace aco {
namespace {

struct phi_info_item {
   Definition def;
   Operand op;
};

struct ssa_elimination_ctx {
   /* The outer vectors should be indexed by block index. The inner vectors store phi information
    * for each block. */
   std::vector<std::vector<phi_info_item>> logical_phi_info;
   std::vector<std::vector<phi_info_item>> linear_phi_info;
   std::vector<bool> empty_blocks;
   std::vector<bool> blocks_incoming_exec_used;
   Program* program;

   ssa_elimination_ctx(Program* program_)
       : logical_phi_info(program_->blocks.size()), linear_phi_info(program_->blocks.size()),
         empty_blocks(program_->blocks.size(), true),
         blocks_incoming_exec_used(program_->blocks.size(), true), program(program_)
   {}
};

void
collect_phi_info(ssa_elimination_ctx& ctx)
{
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
            break;

         for (unsigned i = 0; i < phi->operands.size(); i++) {
            if (phi->operands[i].isUndefined())
               continue;
            if (phi->operands[i].physReg() == phi->definitions[0].physReg())
               continue;

            assert(phi->definitions[0].size() == phi->operands[i].size());

            std::vector<unsigned>& preds =
               phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
            uint32_t pred_idx = preds[i];
            auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx]
                                                              : ctx.linear_phi_info[pred_idx];
            info_vec.push_back({phi->definitions[0], phi->operands[i]});
            ctx.empty_blocks[pred_idx] = false;
         }
      }
   }
}
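
/* Lower the phi information gathered above into p_parallelcopy instructions in
 * the predecessor blocks. For example, a logical phi
 *    v0 = p_phi v1 (from block A), v2 (from block B)
 * becomes a copy "v0 = v1" before the p_logical_end of A and a copy "v0 = v2"
 * before the p_logical_end of B; copies whose source and destination registers
 * already match were filtered out by collect_phi_info(). Linear phis are
 * handled the same way, except that their copies go right before the block's
 * final branch and may need a scratch SGPR and SCC handling. */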
void
insert_parallelcopies(ssa_elimination_ctx& ctx)
{
   /* insert the parallelcopies from logical phis before p_logical_end */
   for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
      auto& logical_phi_info = ctx.logical_phi_info[block_idx];
      if (logical_phi_info.empty())
         continue;

      Block& block = ctx.program->blocks[block_idx];
      unsigned idx = block.instructions.size() - 1;
      while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
         assert(idx > 0);
         idx--;
      }

      std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);
      aco_ptr<Pseudo_instruction> pc{
         create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
                                                logical_phi_info.size(), logical_phi_info.size())};
      unsigned i = 0;
      for (auto& phi_info : logical_phi_info) {
         pc->definitions[i] = phi_info.def;
         pc->operands[i] = phi_info.op;
         i++;
      }
      /* this shouldn't be needed since we're only copying vgprs */
      pc->tmp_in_scc = false;
      block.instructions.insert(it, std::move(pc));
   }

   /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
   for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
      auto& linear_phi_info = ctx.linear_phi_info[block_idx];
      if (linear_phi_info.empty())
         continue;

      Block& block = ctx.program->blocks[block_idx];
      std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
      --it;
      assert((*it)->isBranch());
      PhysReg scratch_sgpr = (*it)->definitions[0].physReg();
      aco_ptr<Pseudo_instruction> pc{
         create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
                                                linear_phi_info.size(), linear_phi_info.size())};
      unsigned i = 0;
      for (auto& phi_info : linear_phi_info) {
         pc->definitions[i] = phi_info.def;
         pc->operands[i] = phi_info.op;
         i++;
      }
      pc->tmp_in_scc = block.scc_live_out;
      pc->scratch_sgpr = scratch_sgpr;
      block.instructions.insert(it, std::move(pc));
   }
}

bool
is_empty_block(Block* block, bool ignore_exec_writes)
{
   /* check if this block is empty and the exec mask is not needed */
   for (aco_ptr<Instruction>& instr : block->instructions) {
      switch (instr->opcode) {
      case aco_opcode::p_linear_phi:
      case aco_opcode::p_phi:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_branch: break;
      case aco_opcode::p_parallelcopy:
         for (unsigned i = 0; i < instr->definitions.size(); i++) {
            if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
               continue;
            if (instr->definitions[i].physReg() != instr->operands[i].physReg())
               return false;
         }
         break;
      case aco_opcode::s_andn2_b64:
      case aco_opcode::s_andn2_b32:
         if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
            break;
         return false;
      default: return false;
      }
   }
   return true;
}
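
/* If an empty merge block's only linear successor is another merge block, the
 * successor restores exec anyway, so the exec writes in this block are
 * redundant and everything except the branch can be dropped. */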
void
try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
{
   /* check if the successor is another merge block which restores exec */
   // TODO: divergent loops also restore exec
   if (block->linear_succs.size() != 1 ||
       !(ctx.program->blocks[block->linear_succs[0]].kind & block_kind_merge))
      return;

   /* check if this block is empty */
   if (!is_empty_block(block, true))
      return;

   /* keep the branch instruction and remove the rest */
   aco_ptr<Instruction> branch = std::move(block->instructions.back());
   block->instructions.clear();
   block->instructions.emplace_back(std::move(branch));
}

void
try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
{
   assert(block->linear_succs.size() == 2);
   /* only remove this block if the successor got removed as well */
   if (block->linear_succs[0] != block->linear_succs[1])
      return;

   /* check if block is otherwise empty */
   if (!is_empty_block(block, true))
      return;

   unsigned succ_idx = block->linear_succs[0];
   assert(block->linear_preds.size() == 2);
   for (unsigned i = 0; i < 2; i++) {
      Block* pred = &ctx.program->blocks[block->linear_preds[i]];
      pred->linear_succs[0] = succ_idx;
      ctx.program->blocks[succ_idx].linear_preds[i] = pred->index;

      Pseudo_branch_instruction& branch = pred->instructions.back()->branch();
      assert(branch.isBranch());
      branch.target[0] = succ_idx;
      branch.target[1] = succ_idx;
   }

   block->instructions.clear();
   block->linear_preds.clear();
   block->linear_succs.clear();
}
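
/* Try to remove an empty block with exactly one linear predecessor and one
 * linear successor by redirecting the predecessor's branch around it. The
 * rewrite depends on which branch target points at the block and on whether a
 * fall-through path to the successor exists, so several cases are
 * distinguished below. */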
void
try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
{
   if (!is_empty_block(block, false))
      return;

   Block& pred = ctx.program->blocks[block->linear_preds[0]];
   Block& succ = ctx.program->blocks[block->linear_succs[0]];
   Pseudo_branch_instruction& branch = pred.instructions.back()->branch();
   if (branch.opcode == aco_opcode::p_branch) {
      branch.target[0] = succ.index;
      branch.target[1] = succ.index;
   } else if (branch.target[0] == block->index) {
      branch.target[0] = succ.index;
   } else if (branch.target[0] == succ.index) {
      assert(branch.target[1] == block->index);
      branch.target[1] = succ.index;
      branch.opcode = aco_opcode::p_branch;
   } else if (branch.target[1] == block->index) {
      /* check if there is a fall-through path from block to succ */
      bool falls_through = block->index < succ.index;
      for (unsigned j = block->index + 1; falls_through && j < succ.index; j++) {
         assert(ctx.program->blocks[j].index == j);
         if (!ctx.program->blocks[j].instructions.empty())
            falls_through = false;
      }
      if (falls_through) {
         branch.target[1] = succ.index;
      } else {
         /* check if there is a fall-through path for the alternative target */
         if (block->index >= branch.target[0])
            return;
         for (unsigned j = block->index + 1; j < branch.target[0]; j++) {
            if (!ctx.program->blocks[j].instructions.empty())
               return;
         }

         /* This is a (uniform) break or continue block. The branch condition has to be
          * inverted. */
         if (branch.opcode == aco_opcode::p_cbranch_z)
            branch.opcode = aco_opcode::p_cbranch_nz;
         else if (branch.opcode == aco_opcode::p_cbranch_nz)
            branch.opcode = aco_opcode::p_cbranch_z;
         else
            assert(false);
         /* also invert the linear successors */
         pred.linear_succs[0] = pred.linear_succs[1];
         pred.linear_succs[1] = succ.index;
         branch.target[1] = branch.target[0];
         branch.target[0] = succ.index;
      }
   } else {
      assert(false);
   }

   if (branch.target[0] == branch.target[1])
      branch.opcode = aco_opcode::p_branch;

   for (unsigned i = 0; i < pred.linear_succs.size(); i++)
      if (pred.linear_succs[i] == block->index)
         pred.linear_succs[i] = succ.index;

   for (unsigned i = 0; i < succ.linear_preds.size(); i++)
      if (succ.linear_preds[i] == block->index)
         succ.linear_preds[i] = pred.index;

   block->instructions.clear();
   block->linear_preds.clear();
   block->linear_succs.clear();
}

bool
instr_writes_exec(Instruction* instr)
{
   for (Definition& def : instr->definitions)
      if (def.physReg() == exec || def.physReg() == exec_hi)
         return true;

   return false;
}

void
eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
{
   /* Check if any successor needs the outgoing exec mask from the current block. */

   bool exec_write_used;

   if (!ctx.logical_phi_info[block.index].empty()) {
      exec_write_used = true;
   } else {
      bool copy_to_exec = false;
      bool copy_from_exec = false;

      for (const auto& successor_phi_info : ctx.linear_phi_info[block.index]) {
         copy_to_exec |= successor_phi_info.def.physReg() == exec;
         copy_from_exec |= successor_phi_info.op.physReg() == exec;
      }

      if (copy_from_exec)
         exec_write_used = true;
      else if (copy_to_exec)
         exec_write_used = false;
      else
         /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
         exec_write_used =
            std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
                        [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
   }

   /* Go through all instructions and eliminate useless exec writes. */

   for (int i = block.instructions.size() - 1; i >= 0; --i) {
      aco_ptr<Instruction>& instr = block.instructions[i];

      /* We already take information from phis into account before the loop, so let's just break on
       * phis. */
      if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)
         break;

      /* See if the current instruction needs or writes exec. */
      bool needs_exec = needs_exec_mask(instr.get());
      bool writes_exec = instr_writes_exec(instr.get());

      /* See if we found an unused exec write. */
      if (writes_exec && !exec_write_used) {
         instr.reset();
         continue;
      }

      /* For a newly encountered exec write, clear the used flag. */
      if (writes_exec)
         exec_write_used = false;

      /* If the current instruction needs exec, mark it as used. */
      exec_write_used |= needs_exec;
   }

   /* Remember if the current block needs an incoming exec mask from its predecessors. */
   ctx.blocks_incoming_exec_used[block.index] = exec_write_used;

   /* Cleanup: remove deleted instructions from the vector. */
   auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
   block.instructions.resize(new_end - block.instructions.begin());
}
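
/* Iterate the blocks in reverse order so that the exec-mask information of a
 * block's successors (blocks_incoming_exec_used) is already known when the
 * block itself is processed, then try to remove the blocks that
 * collect_phi_info() marked as empty. */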
void
jump_threading(ssa_elimination_ctx& ctx)
{
   for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
      Block* block = &ctx.program->blocks[i];
      eliminate_useless_exec_writes_in_block(ctx, *block);

      if (!ctx.empty_blocks[i])
         continue;

      if (block->kind & block_kind_invert) {
         try_remove_invert_block(ctx, block);
         continue;
      }

      if (block->linear_succs.size() > 1)
         continue;

      if (block->kind & block_kind_merge || block->kind & block_kind_loop_exit)
         try_remove_merge_block(ctx, block);

      if (block->linear_preds.size() == 1)
         try_remove_simple_block(ctx, block);
   }
}

} /* end namespace */

void
ssa_elimination(Program* program)
{
   ssa_elimination_ctx ctx(program);

   /* Collect information about every phi-instruction */
   collect_phi_info(ctx);

   /* eliminate empty blocks */
   jump_threading(ctx);

   /* insert parallelcopies from SSA elimination */
   insert_parallelcopies(ctx);
}
} // namespace aco