1/* 2 * Copyright © 2018 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25#include "aco_ir.h" 26 27#include <map> 28#include <unordered_map> 29#include <vector> 30 31/* 32 * Implements the algorithm for dominator-tree value numbering 33 * from "Value Numbering" by Briggs, Cooper, and Simpson. 
 */

namespace aco {
namespace {

/* One mixing round of the MurmurHash3 algorithm: folds the 32-bit chunk `k`
 * into the running hash state `h` and returns the new state.
 */
inline uint32_t
murmur_32_scramble(uint32_t h, uint32_t k)
{
   k *= 0xcc9e2d51;
   k = (k << 15) | (k >> 17);
   h ^= k * 0x1b873593;
   h = (h << 13) | (h >> 19);
   h = h * 5 + 0xe6546b64;
   return h;
}

/* Hashes an instruction of concrete type T (a subclass of Instruction).
 *
 * The hash mixes together:
 * - the format and opcode (as the seed),
 * - each operand's constant value / temp id (via Operand::constantValue()),
 * - the raw bytes of the instruction struct past the first 8 bytes,
 *   covering all format-specific fields of T.
 *
 * Definitions are intentionally not hashed: value numbering compares only the
 * right-hand side of instructions (see InstrHash below).
 */
template <typename T>
uint32_t
hash_murmur_32(Instruction* instr)
{
   uint32_t hash = uint32_t(instr->format) << 16 | uint32_t(instr->opcode);

   for (const Operand& op : instr->operands)
      hash = murmur_32_scramble(hash, op.constantValue());

   /* skip format, opcode and pass_flags (the first two 32-bit words of T) */
   for (unsigned i = 2; i < (sizeof(T) >> 2); i++) {
      uint32_t u;
      /* Accesses it through a byte array, so doesn't violate the strict aliasing rule */
      memcpy(&u, reinterpret_cast<uint8_t*>(instr) + i * 4, 4);
      hash = murmur_32_scramble(hash, u);
   }

   /* Finalize. */
   uint32_t len = instr->operands.size() + instr->definitions.size() + sizeof(T);
   hash ^= len;
   hash ^= hash >> 16;
   hash *= 0x85ebca6b;
   hash ^= hash >> 13;
   hash *= 0xc2b2ae35;
   hash ^= hash >> 16;
   return hash;
}

struct InstrHash {
   /* This hash function uses the Murmur3 algorithm written by Austin Appleby
    * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
    *
    * In order to calculate the expression set, only the right-hand-side of an
    * instruction is used for the hash, i.e. everything except the definitions.
    *
    * Dispatches on the instruction's concrete subclass so that the byte-wise
    * hash above covers the full derived struct, not just the Instruction base.
    */
   std::size_t operator()(Instruction* instr) const
   {
      if (instr->isVOP3())
         return hash_murmur_32<VOP3_instruction>(instr);

      if (instr->isDPP16())
         return hash_murmur_32<DPP16_instruction>(instr);

      if (instr->isDPP8())
         return hash_murmur_32<DPP8_instruction>(instr);

      if (instr->isSDWA())
         return hash_murmur_32<SDWA_instruction>(instr);

      switch (instr->format) {
      case Format::SMEM: return hash_murmur_32<SMEM_instruction>(instr);
      case Format::VINTRP: return hash_murmur_32<Interp_instruction>(instr);
      case Format::DS: return hash_murmur_32<DS_instruction>(instr);
      case Format::SOPP: return hash_murmur_32<SOPP_instruction>(instr);
      case Format::SOPK: return hash_murmur_32<SOPK_instruction>(instr);
      case Format::EXP: return hash_murmur_32<Export_instruction>(instr);
      case Format::MUBUF: return hash_murmur_32<MUBUF_instruction>(instr);
      case Format::MIMG: return hash_murmur_32<MIMG_instruction>(instr);
      case Format::MTBUF: return hash_murmur_32<MTBUF_instruction>(instr);
      case Format::FLAT: return hash_murmur_32<FLAT_instruction>(instr);
      case Format::PSEUDO_BRANCH: return hash_murmur_32<Pseudo_branch_instruction>(instr);
      case Format::PSEUDO_REDUCTION: return hash_murmur_32<Pseudo_reduction_instruction>(instr);
      default: return hash_murmur_32<Instruction>(instr);
      }
   }
};

/* Equality predicate for the expression set: two instructions are considered
 * the same value if they have the same opcode, operands and all
 * format-specific modifiers/fields. Definitions are compared only by register
 * class (and fixed physreg), never by temp id, because the whole point is to
 * detect distinct definitions of equal values.
 */
struct InstrPred {
   bool operator()(Instruction* a, Instruction* b) const
   {
      if (a->format != b->format)
         return false;
      if (a->opcode != b->opcode)
         return false;
      if (a->operands.size() != b->operands.size() ||
          a->definitions.size() != b->definitions.size())
         return false; /* possible with pseudo-instructions */
      for (unsigned i = 0; i < a->operands.size(); i++) {
         if (a->operands[i].isConstant()) {
            if (!b->operands[i].isConstant())
               return false;
            if (a->operands[i].constantValue() != b->operands[i].constantValue())
               return false;
         } else if (a->operands[i].isTemp()) {
            if (!b->operands[i].isTemp())
               return false;
            if (a->operands[i].tempId() != b->operands[i].tempId())
               return false;
         } else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined())
            return false;
         if (a->operands[i].isFixed()) {
            if (!b->operands[i].isFixed())
               return false;
            if (a->operands[i].physReg() != b->operands[i].physReg())
               return false;
            /* An operand fixed to exec reads the execution mask, so both
             * instructions must additionally have the same exec_id
             * (stored in pass_flags by process_block). */
            if (a->operands[i].physReg() == exec && a->pass_flags != b->pass_flags)
               return false;
         }
      }
      for (unsigned i = 0; i < a->definitions.size(); i++) {
         if (a->definitions[i].isTemp()) {
            if (!b->definitions[i].isTemp())
               return false;
            if (a->definitions[i].regClass() != b->definitions[i].regClass())
               return false;
         }
         if (a->definitions[i].isFixed()) {
            if (!b->definitions[i].isFixed())
               return false;
            if (a->definitions[i].physReg() != b->definitions[i].physReg())
               return false;
            /* never merge instructions that write the execution mask */
            if (a->definitions[i].physReg() == exec)
               return false;
         }
      }

      /* v_readfirstlane_b32 depends on the active lanes, i.e. the exec mask */
      if (a->opcode == aco_opcode::v_readfirstlane_b32)
         return a->pass_flags == b->pass_flags;

      /* Format-specific fields: every modifier that affects the result must
       * be compared here, otherwise differing instructions could be merged. */
      if (a->isVOP3()) {
         VOP3_instruction& a3 = a->vop3();
         VOP3_instruction& b3 = b->vop3();
         for (unsigned i = 0; i < 3; i++) {
            if (a3.abs[i] != b3.abs[i] || a3.neg[i] != b3.neg[i])
               return false;
         }
         return a3.clamp == b3.clamp && a3.omod == b3.omod && a3.opsel == b3.opsel;
      }
      if (a->isDPP16()) {
         /* DPP moves data between lanes, so the result depends on the exec
          * mask (pass_flags) as well as all DPP controls. */
         DPP16_instruction& aDPP = a->dpp16();
         DPP16_instruction& bDPP = b->dpp16();
         return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl &&
                aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask &&
                aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.abs[0] == bDPP.abs[0] &&
                aDPP.abs[1] == bDPP.abs[1] && aDPP.neg[0] == bDPP.neg[0] &&
                aDPP.neg[1] == bDPP.neg[1];
      }
      if (a->isDPP8()) {
         DPP8_instruction& aDPP = a->dpp8();
         DPP8_instruction& bDPP = b->dpp8();
         return aDPP.pass_flags == bDPP.pass_flags &&
                !memcmp(aDPP.lane_sel, bDPP.lane_sel, sizeof(aDPP.lane_sel));
      }
      if (a->isSDWA()) {
         SDWA_instruction& aSDWA = a->sdwa();
         SDWA_instruction& bSDWA = b->sdwa();
         return aSDWA.sel[0] == bSDWA.sel[0] && aSDWA.sel[1] == bSDWA.sel[1] &&
                aSDWA.dst_sel == bSDWA.dst_sel && aSDWA.abs[0] == bSDWA.abs[0] &&
                aSDWA.abs[1] == bSDWA.abs[1] && aSDWA.neg[0] == bSDWA.neg[0] &&
                aSDWA.neg[1] == bSDWA.neg[1] && aSDWA.clamp == bSDWA.clamp &&
                aSDWA.omod == bSDWA.omod;
      }

      switch (a->format) {
      case Format::SOPK: {
         /* s_getreg_b32 reads hardware state, so it is never a pure value */
         if (a->opcode == aco_opcode::s_getreg_b32)
            return false;
         SOPK_instruction& aK = a->sopk();
         SOPK_instruction& bK = b->sopk();
         return aK.imm == bK.imm;
      }
      case Format::SMEM: {
         SMEM_instruction& aS = a->smem();
         SMEM_instruction& bS = b->smem();
         /* isel shouldn't be creating situations where this assertion fails */
         assert(aS.prevent_overflow == bS.prevent_overflow);
         return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && aS.nv == bS.nv &&
                aS.disable_wqm == bS.disable_wqm && aS.prevent_overflow == bS.prevent_overflow;
      }
      case Format::VINTRP: {
         Interp_instruction& aI = a->vintrp();
         Interp_instruction& bI = b->vintrp();
         if (aI.attribute != bI.attribute)
            return false;
         if (aI.component != bI.component)
            return false;
         return true;
      }
      case Format::VOP3P: {
         VOP3P_instruction& a3P = a->vop3p();
         VOP3P_instruction& b3P = b->vop3p();
         for (unsigned i = 0; i < 3; i++) {
            if (a3P.neg_lo[i] != b3P.neg_lo[i] || a3P.neg_hi[i] != b3P.neg_hi[i])
               return false;
         }
         return a3P.opsel_lo == b3P.opsel_lo && a3P.opsel_hi == b3P.opsel_hi &&
                a3P.clamp == b3P.clamp;
      }
      case Format::PSEUDO_REDUCTION: {
         /* reductions combine values across lanes -> exec mask matters */
         Pseudo_reduction_instruction& aR = a->reduction();
         Pseudo_reduction_instruction& bR = b->reduction();
         return aR.pass_flags == bR.pass_flags && aR.reduce_op == bR.reduce_op &&
                aR.cluster_size == bR.cluster_size;
      }
      case Format::DS: {
         /* can_eliminate() only lets these three cross-lane DS opcodes in */
         assert(a->opcode == aco_opcode::ds_bpermute_b32 ||
                a->opcode == aco_opcode::ds_permute_b32 || a->opcode == aco_opcode::ds_swizzle_b32);
         DS_instruction& aD = a->ds();
         DS_instruction& bD = b->ds();
         return aD.sync == bD.sync && aD.pass_flags == bD.pass_flags && aD.gds == bD.gds &&
                aD.offset0 == bD.offset0 && aD.offset1 == bD.offset1;
      }
      case Format::MTBUF: {
         MTBUF_instruction& aM = a->mtbuf();
         MTBUF_instruction& bM = b->mtbuf();
         return aM.sync == bM.sync && aM.dfmt == bM.dfmt && aM.nfmt == bM.nfmt &&
                aM.offset == bM.offset && aM.offen == bM.offen && aM.idxen == bM.idxen &&
                aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && aM.tfe == bM.tfe &&
                aM.disable_wqm == bM.disable_wqm;
      }
      case Format::MUBUF: {
         MUBUF_instruction& aM = a->mubuf();
         MUBUF_instruction& bM = b->mubuf();
         return aM.sync == bM.sync && aM.offset == bM.offset && aM.offen == bM.offen &&
                aM.idxen == bM.idxen && aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc &&
                aM.tfe == bM.tfe && aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm;
      }
      case Format::MIMG: {
         MIMG_instruction& aM = a->mimg();
         MIMG_instruction& bM = b->mimg();
         return aM.sync == bM.sync && aM.dmask == bM.dmask && aM.unrm == bM.unrm &&
                aM.glc == bM.glc && aM.slc == bM.slc && aM.tfe == bM.tfe && aM.da == bM.da &&
                aM.lwe == bM.lwe && aM.r128 == bM.r128 && aM.a16 == bM.a16 && aM.d16 == bM.d16 &&
                aM.disable_wqm == bM.disable_wqm;
      }
      /* these formats are rejected by can_eliminate() and must never reach
       * the expression set */
      case Format::FLAT:
      case Format::GLOBAL:
      case Format::SCRATCH:
      case Format::EXP:
      case Format::SOPP:
      case Format::PSEUDO_BRANCH:
      case Format::PSEUDO_BARRIER: assert(false);
      default: return true;
      }
   }
};

/* maps an instruction (keyed by its right-hand side) to the index of the
 * block in which it was first seen */
using expr_set = std::unordered_map<Instruction*, uint32_t, InstrHash, InstrPred>;

struct vn_ctx {
   Program* program;
   expr_set expr_values;
   /* maps an eliminated temp id to the canonical Temp it is replaced with */
   std::map<uint32_t, Temp> renames;

   /* The exec id should be the same on the same level of control flow depth.
    * Together with the check for dominator relations, it is safe to assume
    * that the same exec_id also means the same execution mask.
    * Discards increment the exec_id, so that it won't return to the previous value.
    */
   uint32_t exec_id = 1;

   vn_ctx(Program* program_) : program(program_)
   {
      static_assert(sizeof(Temp) == 4, "Temp must fit in 32bits");
      /* reserve for the worst case (every instruction CSE-able) to avoid
       * rehashing during the pass */
      unsigned size = 0;
      for (Block& block : program->blocks)
         size += block.instructions.size();
      expr_values.reserve(size);
   }
};

/* dominates() returns true if the parent block dominates the child block and
 * if the parent block is part of the same loop or has a smaller loop nest depth.
 *
 * Walks up the child's logical immediate-dominator chain; block indices are
 * assumed to be in dominance-compatible order (idom index < block index).
 */
bool
dominates(vn_ctx& ctx, uint32_t parent, uint32_t child)
{
   unsigned parent_loop_nest_depth = ctx.program->blocks[parent].loop_nest_depth;
   while (parent < child && parent_loop_nest_depth <= ctx.program->blocks[child].loop_nest_depth)
      child = ctx.program->blocks[child].logical_idom;

   return parent == child;
}

/** Returns whether this instruction can safely be removed
 * and replaced by an equal expression.
 * This is in particular true for ALU instructions and
 * read-only memory instructions.
 *
 * Note that expr_set must not be used with instructions
 * which cannot be eliminated.
 */
bool
can_eliminate(aco_ptr<Instruction>& instr)
{
   switch (instr->format) {
   /* control flow, exports and barriers have side effects */
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
   case Format::EXP:
   case Format::SOPP:
   case Format::PSEUDO_BRANCH:
   case Format::PSEUDO_BARRIER: return false;
   /* only the pure cross-lane DS data-movement opcodes are eliminable */
   case Format::DS:
      return instr->opcode == aco_opcode::ds_bpermute_b32 ||
             instr->opcode == aco_opcode::ds_permute_b32 ||
             instr->opcode == aco_opcode::ds_swizzle_b32;
   /* memory loads are only eliminable if they can be reordered, i.e. no
    * other instruction may write the memory they read */
   case Format::SMEM:
   case Format::MUBUF:
   case Format::MIMG:
   case Format::MTBUF:
      if (!get_sync_info(instr.get()).can_reorder())
         return false;
      break;
   default: break;
   }

   /* phis are position-dependent and NoCSE definitions are explicitly
    * excluded from elimination */
   if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi ||
       instr->opcode == aco_opcode::p_linear_phi || instr->definitions[0].isNoCSE())
      return false;

   return true;
}

/* Performs local value numbering on a single block: renames operands with
 * already-computed replacements, then either records each eliminable
 * instruction in the expression set or replaces its definitions with those of
 * a dominating equal instruction. Kept instructions are moved into a new
 * instruction vector which replaces the block's.
 */
void
process_block(vn_ctx& ctx, Block& block)
{
   std::vector<aco_ptr<Instruction>> new_instructions;
   new_instructions.reserve(block.instructions.size());

   for (aco_ptr<Instruction>& instr : block.instructions) {
      /* first, rename operands */
      for (Operand& op : instr->operands) {
         if (!op.isTemp())
            continue;
         auto it = ctx.renames.find(op.tempId());
         if (it != ctx.renames.end())
            op.setTemp(it->second);
      }

      /* discards change the exec mask permanently */
      if (instr->opcode == aco_opcode::p_discard_if ||
          instr->opcode == aco_opcode::p_demote_to_helper)
         ctx.exec_id++;

      if (!can_eliminate(instr)) {
         new_instructions.emplace_back(std::move(instr));
         continue;
      }

      /* simple copy-propagation through renaming */
      bool copy_instr =
         instr->opcode == aco_opcode::p_parallelcopy ||
         (instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1);
      if (copy_instr && !instr->definitions[0].isFixed() && instr->operands[0].isTemp() &&
          instr->operands[0].regClass() == instr->definitions[0].regClass()) {
         ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp();
         continue;
      }

      /* stash the current exec id in pass_flags so InstrPred can compare it */
      instr->pass_flags = ctx.exec_id;
      std::pair<expr_set::iterator, bool> res = ctx.expr_values.emplace(instr.get(), block.index);

      /* if there was already an expression with the same value number */
      if (!res.second) {
         Instruction* orig_instr = res.first->first;
         assert(instr->definitions.size() == orig_instr->definitions.size());
         /* check if the original instruction dominates the current one */
         if (dominates(ctx, res.first->second, block.index) &&
             ctx.program->blocks[res.first->second].fp_mode.canReplace(block.fp_mode)) {
            for (unsigned i = 0; i < instr->definitions.size(); i++) {
               assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass());
               assert(instr->definitions[i].isTemp());
               ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp();
               /* the replacement must keep the precise flag if any duplicate had it */
               if (instr->definitions[i].isPrecise())
                  orig_instr->definitions[i].setPrecise(true);
               /* SPIR_V spec says that an instruction marked with NUW wrapping
                * around is undefined behaviour, so we can break additions in
                * other contexts.
                */
               if (instr->definitions[i].isNUW())
                  orig_instr->definitions[i].setNUW(true);
            }
         } else {
            /* the stored instruction doesn't dominate us: replace the map
             * entry with this one (it is at least as likely to dominate
             * later blocks) and keep the instruction */
            ctx.expr_values.erase(res.first);
            ctx.expr_values.emplace(instr.get(), block.index);
            new_instructions.emplace_back(std::move(instr));
         }
      } else {
         new_instructions.emplace_back(std::move(instr));
      }
   }

   block.instructions = std::move(new_instructions);
}

/* Applies the rename map to the operands of the phis at the top of a block.
 * Used for blocks which are not value-numbered themselves (no logical idom)
 * and for loop headers, whose back-edge operands are only known after the
 * whole loop has been processed.
 */
void
rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames)
{
   for (aco_ptr<Instruction>& phi : block.instructions) {
      /* phis are grouped at the start of the block */
      if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
         break;

      for (Operand& op : phi->operands) {
         if (!op.isTemp())
            continue;
         auto it = renames.find(op.tempId());
         if (it != renames.end())
            op.setTemp(it->second);
      }
   }
}
} /* end namespace */

/* Entry point of the pass: value-numbers every block in program order while
 * maintaining exec_id across control-flow kinds, then fixes up loop header
 * phi operands that reference temps renamed later in the loop body.
 */
void
value_numbering(Program* program)
{
   vn_ctx ctx(program);
   std::vector<unsigned> loop_headers;

   for (Block& block : program->blocks) {
      assert(ctx.exec_id > 0);
      /* decrement exec_id when leaving nested control flow */
      if (block.kind & block_kind_loop_header)
         loop_headers.push_back(block.index);
      if (block.kind & block_kind_merge) {
         ctx.exec_id--;
      } else if (block.kind & block_kind_loop_exit) {
         /* undo the increments from every edge entering the loop and every
          * edge entering the exit block */
         ctx.exec_id -= program->blocks[loop_headers.back()].linear_preds.size();
         ctx.exec_id -= block.linear_preds.size();
         loop_headers.pop_back();
      }

      if (block.logical_idom != -1)
         process_block(ctx, block);
      else
         rename_phi_operands(block, ctx.renames);

      /* increment exec_id when entering nested control flow */
      if (block.kind & block_kind_branch || block.kind & block_kind_loop_preheader ||
          block.kind & block_kind_break || block.kind & block_kind_continue)
         ctx.exec_id++;
      else if (block.kind & block_kind_continue_or_break)
         ctx.exec_id += 2;
   }

   /* rename loop header phi operands */
   for (Block& block : program->blocks) {
      if (block.kind & block_kind_loop_header)
         rename_phi_operands(block, ctx.renames);
   }
}

} // namespace aco