1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2021 Valve Corporation 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci */ 24bf215546Sopenharmony_ci 25bf215546Sopenharmony_ci#include "aco_builder.h" 26bf215546Sopenharmony_ci#include "aco_ir.h" 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci#include <algorithm> 29bf215546Sopenharmony_ci#include <array> 30bf215546Sopenharmony_ci#include <bitset> 31bf215546Sopenharmony_ci#include <vector> 32bf215546Sopenharmony_ci 33bf215546Sopenharmony_cinamespace aco { 34bf215546Sopenharmony_cinamespace { 35bf215546Sopenharmony_ci 36bf215546Sopenharmony_ciconstexpr const size_t max_reg_cnt = 512; 37bf215546Sopenharmony_ciconstexpr const size_t max_sgpr_cnt = 128; 38bf215546Sopenharmony_ciconstexpr const size_t min_vgpr = 256; 39bf215546Sopenharmony_ciconstexpr const size_t max_vgpr_cnt = 256; 40bf215546Sopenharmony_ci 41bf215546Sopenharmony_cistruct Idx { 42bf215546Sopenharmony_ci bool operator==(const Idx& other) const { return block == other.block && instr == other.instr; } 43bf215546Sopenharmony_ci bool operator!=(const Idx& other) const { return !operator==(other); } 44bf215546Sopenharmony_ci 45bf215546Sopenharmony_ci bool found() const { return block != UINT32_MAX; } 46bf215546Sopenharmony_ci 47bf215546Sopenharmony_ci uint32_t block; 48bf215546Sopenharmony_ci uint32_t instr; 49bf215546Sopenharmony_ci}; 50bf215546Sopenharmony_ci 51bf215546Sopenharmony_ciIdx not_written_in_block{UINT32_MAX, 0}; 52bf215546Sopenharmony_ciIdx clobbered{UINT32_MAX, 1}; 53bf215546Sopenharmony_ciIdx const_or_undef{UINT32_MAX, 2}; 54bf215546Sopenharmony_ciIdx written_by_multiple_instrs{UINT32_MAX, 3}; 55bf215546Sopenharmony_ci 56bf215546Sopenharmony_cistruct pr_opt_ctx { 57bf215546Sopenharmony_ci Program* program; 58bf215546Sopenharmony_ci Block* current_block; 59bf215546Sopenharmony_ci uint32_t current_instr_idx; 60bf215546Sopenharmony_ci std::vector<uint16_t> uses; 61bf215546Sopenharmony_ci std::vector<std::array<Idx, max_reg_cnt>> instr_idx_by_regs; 62bf215546Sopenharmony_ci 63bf215546Sopenharmony_ci void reset_block(Block* block) 64bf215546Sopenharmony_ci { 65bf215546Sopenharmony_ci current_block = block; 66bf215546Sopenharmony_ci current_instr_idx = 0; 67bf215546Sopenharmony_ci 68bf215546Sopenharmony_ci if ((block->kind & block_kind_loop_header) || block->linear_preds.empty()) { 69bf215546Sopenharmony_ci std::fill(instr_idx_by_regs[block->index].begin(), instr_idx_by_regs[block->index].end(), 70bf215546Sopenharmony_ci not_written_in_block); 71bf215546Sopenharmony_ci } else { 72bf215546Sopenharmony_ci const uint32_t first_linear_pred = block->linear_preds[0]; 73bf215546Sopenharmony_ci const std::vector<uint32_t>& linear_preds = block->linear_preds; 74bf215546Sopenharmony_ci 75bf215546Sopenharmony_ci for (unsigned i = 0; i < max_sgpr_cnt; i++) { 76bf215546Sopenharmony_ci const bool all_same = std::all_of( 77bf215546Sopenharmony_ci std::next(linear_preds.begin()), linear_preds.end(), 78bf215546Sopenharmony_ci [=](unsigned pred) 79bf215546Sopenharmony_ci { return instr_idx_by_regs[pred][i] == instr_idx_by_regs[first_linear_pred][i]; }); 80bf215546Sopenharmony_ci 81bf215546Sopenharmony_ci if (all_same) 82bf215546Sopenharmony_ci instr_idx_by_regs[block->index][i] = instr_idx_by_regs[first_linear_pred][i]; 83bf215546Sopenharmony_ci else 84bf215546Sopenharmony_ci instr_idx_by_regs[block->index][i] = written_by_multiple_instrs; 85bf215546Sopenharmony_ci } 86bf215546Sopenharmony_ci 87bf215546Sopenharmony_ci if (!block->logical_preds.empty()) { 88bf215546Sopenharmony_ci /* We assume that VGPRs are only read by blocks which have a logical predecessor, 89bf215546Sopenharmony_ci * ie. any block that reads any VGPR has at least 1 logical predecessor. 90bf215546Sopenharmony_ci */ 91bf215546Sopenharmony_ci const unsigned first_logical_pred = block->logical_preds[0]; 92bf215546Sopenharmony_ci const std::vector<uint32_t>& logical_preds = block->logical_preds; 93bf215546Sopenharmony_ci 94bf215546Sopenharmony_ci for (unsigned i = min_vgpr; i < (min_vgpr + max_vgpr_cnt); i++) { 95bf215546Sopenharmony_ci const bool all_same = std::all_of( 96bf215546Sopenharmony_ci std::next(logical_preds.begin()), logical_preds.end(), 97bf215546Sopenharmony_ci [=](unsigned pred) { 98bf215546Sopenharmony_ci return instr_idx_by_regs[pred][i] == instr_idx_by_regs[first_logical_pred][i]; 99bf215546Sopenharmony_ci }); 100bf215546Sopenharmony_ci 101bf215546Sopenharmony_ci if (all_same) 102bf215546Sopenharmony_ci instr_idx_by_regs[block->index][i] = instr_idx_by_regs[first_logical_pred][i]; 103bf215546Sopenharmony_ci else 104bf215546Sopenharmony_ci instr_idx_by_regs[block->index][i] = written_by_multiple_instrs; 105bf215546Sopenharmony_ci } 106bf215546Sopenharmony_ci } else { 107bf215546Sopenharmony_ci /* If a block has no logical predecessors, it is not part of the 108bf215546Sopenharmony_ci * logical CFG and therefore it also won't have any logical successors. 109bf215546Sopenharmony_ci * Such a block does not write any VGPRs ever. 110bf215546Sopenharmony_ci */ 111bf215546Sopenharmony_ci assert(block->logical_succs.empty()); 112bf215546Sopenharmony_ci } 113bf215546Sopenharmony_ci } 114bf215546Sopenharmony_ci } 115bf215546Sopenharmony_ci 116bf215546Sopenharmony_ci Instruction* get(Idx idx) { return program->blocks[idx.block].instructions[idx.instr].get(); } 117bf215546Sopenharmony_ci}; 118bf215546Sopenharmony_ci 119bf215546Sopenharmony_civoid 120bf215546Sopenharmony_cisave_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr) 121bf215546Sopenharmony_ci{ 122bf215546Sopenharmony_ci for (const Definition& def : instr->definitions) { 123bf215546Sopenharmony_ci assert(def.regClass().type() != RegType::sgpr || def.physReg().reg() <= 255); 124bf215546Sopenharmony_ci assert(def.regClass().type() != RegType::vgpr || def.physReg().reg() >= 256); 125bf215546Sopenharmony_ci 126bf215546Sopenharmony_ci unsigned dw_size = DIV_ROUND_UP(def.bytes(), 4u); 127bf215546Sopenharmony_ci unsigned r = def.physReg().reg(); 128bf215546Sopenharmony_ci Idx idx{ctx.current_block->index, ctx.current_instr_idx}; 129bf215546Sopenharmony_ci 130bf215546Sopenharmony_ci if (def.regClass().is_subdword()) 131bf215546Sopenharmony_ci idx = clobbered; 132bf215546Sopenharmony_ci 133bf215546Sopenharmony_ci assert((r + dw_size) <= max_reg_cnt); 134bf215546Sopenharmony_ci assert(def.size() == dw_size || def.regClass().is_subdword()); 135bf215546Sopenharmony_ci std::fill(ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r, 136bf215546Sopenharmony_ci ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size, idx); 137bf215546Sopenharmony_ci } 138bf215546Sopenharmony_ci} 139bf215546Sopenharmony_ci 140bf215546Sopenharmony_ciIdx 141bf215546Sopenharmony_cilast_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc) 142bf215546Sopenharmony_ci{ 143bf215546Sopenharmony_ci /* Verify that all of the operand's registers are written by the same instruction. */ 144bf215546Sopenharmony_ci assert(physReg.reg() < max_reg_cnt); 145bf215546Sopenharmony_ci Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][physReg.reg()]; 146bf215546Sopenharmony_ci unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u); 147bf215546Sopenharmony_ci unsigned r = physReg.reg(); 148bf215546Sopenharmony_ci bool all_same = 149bf215546Sopenharmony_ci std::all_of(ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r, 150bf215546Sopenharmony_ci ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size, 151bf215546Sopenharmony_ci [instr_idx](Idx i) { return i == instr_idx; }); 152bf215546Sopenharmony_ci 153bf215546Sopenharmony_ci return all_same ? instr_idx : written_by_multiple_instrs; 154bf215546Sopenharmony_ci} 155bf215546Sopenharmony_ci 156bf215546Sopenharmony_ciIdx 157bf215546Sopenharmony_cilast_writer_idx(pr_opt_ctx& ctx, const Operand& op) 158bf215546Sopenharmony_ci{ 159bf215546Sopenharmony_ci if (op.isConstant() || op.isUndefined()) 160bf215546Sopenharmony_ci return const_or_undef; 161bf215546Sopenharmony_ci 162bf215546Sopenharmony_ci return last_writer_idx(ctx, op.physReg(), op.regClass()); 163bf215546Sopenharmony_ci} 164bf215546Sopenharmony_ci 165bf215546Sopenharmony_cibool 166bf215546Sopenharmony_ciis_clobbered_since(pr_opt_ctx& ctx, PhysReg reg, RegClass rc, const Idx& idx) 167bf215546Sopenharmony_ci{ 168bf215546Sopenharmony_ci /* If we didn't find an instruction, assume that the register is clobbered. */ 169bf215546Sopenharmony_ci if (!idx.found()) 170bf215546Sopenharmony_ci return true; 171bf215546Sopenharmony_ci 172bf215546Sopenharmony_ci /* TODO: We currently can't keep track of subdword registers. */ 173bf215546Sopenharmony_ci if (rc.is_subdword()) 174bf215546Sopenharmony_ci return true; 175bf215546Sopenharmony_ci 176bf215546Sopenharmony_ci unsigned begin_reg = reg.reg(); 177bf215546Sopenharmony_ci unsigned end_reg = begin_reg + rc.size(); 178bf215546Sopenharmony_ci unsigned current_block_idx = ctx.current_block->index; 179bf215546Sopenharmony_ci 180bf215546Sopenharmony_ci for (unsigned r = begin_reg; r < end_reg; ++r) { 181bf215546Sopenharmony_ci Idx& i = ctx.instr_idx_by_regs[current_block_idx][r]; 182bf215546Sopenharmony_ci if (i == clobbered || i == written_by_multiple_instrs) 183bf215546Sopenharmony_ci return true; 184bf215546Sopenharmony_ci else if (i == not_written_in_block) 185bf215546Sopenharmony_ci continue; 186bf215546Sopenharmony_ci 187bf215546Sopenharmony_ci assert(i.found()); 188bf215546Sopenharmony_ci 189bf215546Sopenharmony_ci if (i.block > idx.block || (i.block == idx.block && i.instr > idx.instr)) 190bf215546Sopenharmony_ci return true; 191bf215546Sopenharmony_ci } 192bf215546Sopenharmony_ci 193bf215546Sopenharmony_ci return false; 194bf215546Sopenharmony_ci} 195bf215546Sopenharmony_ci 196bf215546Sopenharmony_citemplate <typename T> 197bf215546Sopenharmony_cibool 198bf215546Sopenharmony_ciis_clobbered_since(pr_opt_ctx& ctx, const T& t, const Idx& idx) 199bf215546Sopenharmony_ci{ 200bf215546Sopenharmony_ci return is_clobbered_since(ctx, t.physReg(), t.regClass(), idx); 201bf215546Sopenharmony_ci} 202bf215546Sopenharmony_ci 203bf215546Sopenharmony_civoid 204bf215546Sopenharmony_citry_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr) 205bf215546Sopenharmony_ci{ 206bf215546Sopenharmony_ci /* We are looking for the following pattern: 207bf215546Sopenharmony_ci * 208bf215546Sopenharmony_ci * vcc = ... ; last_vcc_wr 209bf215546Sopenharmony_ci * sX, scc = s_and_bXX vcc, exec ; op0_instr 210bf215546Sopenharmony_ci * (...vcc and exec must not be clobbered inbetween...) 211bf215546Sopenharmony_ci * s_cbranch_XX scc ; instr 212bf215546Sopenharmony_ci * 213bf215546Sopenharmony_ci * If possible, the above is optimized into: 214bf215546Sopenharmony_ci * 215bf215546Sopenharmony_ci * vcc = ... ; last_vcc_wr 216bf215546Sopenharmony_ci * s_cbranch_XX vcc ; instr modified to use vcc 217bf215546Sopenharmony_ci */ 218bf215546Sopenharmony_ci 219bf215546Sopenharmony_ci /* Don't try to optimize this on GFX6-7 because SMEM may corrupt the vccz bit. */ 220bf215546Sopenharmony_ci if (ctx.program->gfx_level < GFX8) 221bf215546Sopenharmony_ci return; 222bf215546Sopenharmony_ci 223bf215546Sopenharmony_ci if (instr->format != Format::PSEUDO_BRANCH || instr->operands.size() == 0 || 224bf215546Sopenharmony_ci instr->operands[0].physReg() != scc) 225bf215546Sopenharmony_ci return; 226bf215546Sopenharmony_ci 227bf215546Sopenharmony_ci Idx op0_instr_idx = last_writer_idx(ctx, instr->operands[0]); 228bf215546Sopenharmony_ci Idx last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask); 229bf215546Sopenharmony_ci 230bf215546Sopenharmony_ci /* We need to make sure: 231bf215546Sopenharmony_ci * - the instructions that wrote the operand register and VCC are both found 232bf215546Sopenharmony_ci * - the operand register used by the branch, and VCC were both written in the current block 233bf215546Sopenharmony_ci * - EXEC hasn't been clobbered since the last VCC write 234bf215546Sopenharmony_ci * - VCC hasn't been clobbered since the operand register was written 235bf215546Sopenharmony_ci * (ie. the last VCC writer precedes the op0 writer) 236bf215546Sopenharmony_ci */ 237bf215546Sopenharmony_ci if (!op0_instr_idx.found() || !last_vcc_wr_idx.found() || 238bf215546Sopenharmony_ci op0_instr_idx.block != ctx.current_block->index || 239bf215546Sopenharmony_ci last_vcc_wr_idx.block != ctx.current_block->index || 240bf215546Sopenharmony_ci is_clobbered_since(ctx, exec, ctx.program->lane_mask, last_vcc_wr_idx) || 241bf215546Sopenharmony_ci is_clobbered_since(ctx, vcc, ctx.program->lane_mask, op0_instr_idx)) 242bf215546Sopenharmony_ci return; 243bf215546Sopenharmony_ci 244bf215546Sopenharmony_ci Instruction* op0_instr = ctx.get(op0_instr_idx); 245bf215546Sopenharmony_ci Instruction* last_vcc_wr = ctx.get(last_vcc_wr_idx); 246bf215546Sopenharmony_ci 247bf215546Sopenharmony_ci if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ && 248bf215546Sopenharmony_ci op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) || 249bf215546Sopenharmony_ci op0_instr->operands[0].physReg() != vcc || op0_instr->operands[1].physReg() != exec || 250bf215546Sopenharmony_ci !last_vcc_wr->isVOPC()) 251bf215546Sopenharmony_ci return; 252bf215546Sopenharmony_ci 253bf215546Sopenharmony_ci assert(last_vcc_wr->definitions[0].tempId() == op0_instr->operands[0].tempId()); 254bf215546Sopenharmony_ci 255bf215546Sopenharmony_ci /* Reduce the uses of the SCC def */ 256bf215546Sopenharmony_ci ctx.uses[instr->operands[0].tempId()]--; 257bf215546Sopenharmony_ci /* Use VCC instead of SCC in the branch */ 258bf215546Sopenharmony_ci instr->operands[0] = op0_instr->operands[0]; 259bf215546Sopenharmony_ci} 260bf215546Sopenharmony_ci 261bf215546Sopenharmony_civoid 262bf215546Sopenharmony_citry_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr) 263bf215546Sopenharmony_ci{ 264bf215546Sopenharmony_ci /* We are looking for the following pattern: 265bf215546Sopenharmony_ci * 266bf215546Sopenharmony_ci * s_bfe_u32 s0, s3, 0x40018 ; outputs SGPR and SCC if the SGPR != 0 267bf215546Sopenharmony_ci * s_cmp_eq_i32 s0, 0 ; comparison between the SGPR and 0 268bf215546Sopenharmony_ci * s_cbranch_scc0 BB3 ; use the result of the comparison, eg. branch or cselect 269bf215546Sopenharmony_ci * 270bf215546Sopenharmony_ci * If possible, the above is optimized into: 271bf215546Sopenharmony_ci * 272bf215546Sopenharmony_ci * s_bfe_u32 s0, s3, 0x40018 ; original instruction 273bf215546Sopenharmony_ci * s_cbranch_scc1 BB3 ; modified to use SCC directly rather than the SGPR with comparison 274bf215546Sopenharmony_ci * 275bf215546Sopenharmony_ci */ 276bf215546Sopenharmony_ci 277bf215546Sopenharmony_ci if (!instr->isSALU() && !instr->isBranch()) 278bf215546Sopenharmony_ci return; 279bf215546Sopenharmony_ci 280bf215546Sopenharmony_ci if (instr->isSOPC() && 281bf215546Sopenharmony_ci (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 || 282bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cmp_lg_u32 || instr->opcode == aco_opcode::s_cmp_lg_i32 || 283bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cmp_eq_u64 || instr->opcode == aco_opcode::s_cmp_lg_u64) && 284bf215546Sopenharmony_ci (instr->operands[0].constantEquals(0) || instr->operands[1].constantEquals(0)) && 285bf215546Sopenharmony_ci (instr->operands[0].isTemp() || instr->operands[1].isTemp())) { 286bf215546Sopenharmony_ci /* Make sure the constant is always in operand 1 */ 287bf215546Sopenharmony_ci if (instr->operands[0].isConstant()) 288bf215546Sopenharmony_ci std::swap(instr->operands[0], instr->operands[1]); 289bf215546Sopenharmony_ci 290bf215546Sopenharmony_ci if (ctx.uses[instr->operands[0].tempId()] > 1) 291bf215546Sopenharmony_ci return; 292bf215546Sopenharmony_ci 293bf215546Sopenharmony_ci /* Make sure both SCC and Operand 0 are written by the same instruction. */ 294bf215546Sopenharmony_ci Idx wr_idx = last_writer_idx(ctx, instr->operands[0]); 295bf215546Sopenharmony_ci Idx sccwr_idx = last_writer_idx(ctx, scc, s1); 296bf215546Sopenharmony_ci if (!wr_idx.found() || wr_idx != sccwr_idx) 297bf215546Sopenharmony_ci return; 298bf215546Sopenharmony_ci 299bf215546Sopenharmony_ci Instruction* wr_instr = ctx.get(wr_idx); 300bf215546Sopenharmony_ci if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 || 301bf215546Sopenharmony_ci wr_instr->definitions[1].physReg() != scc) 302bf215546Sopenharmony_ci return; 303bf215546Sopenharmony_ci 304bf215546Sopenharmony_ci /* Look for instructions which set SCC := (D != 0) */ 305bf215546Sopenharmony_ci switch (wr_instr->opcode) { 306bf215546Sopenharmony_ci case aco_opcode::s_bfe_i32: 307bf215546Sopenharmony_ci case aco_opcode::s_bfe_i64: 308bf215546Sopenharmony_ci case aco_opcode::s_bfe_u32: 309bf215546Sopenharmony_ci case aco_opcode::s_bfe_u64: 310bf215546Sopenharmony_ci case aco_opcode::s_and_b32: 311bf215546Sopenharmony_ci case aco_opcode::s_and_b64: 312bf215546Sopenharmony_ci case aco_opcode::s_andn2_b32: 313bf215546Sopenharmony_ci case aco_opcode::s_andn2_b64: 314bf215546Sopenharmony_ci case aco_opcode::s_or_b32: 315bf215546Sopenharmony_ci case aco_opcode::s_or_b64: 316bf215546Sopenharmony_ci case aco_opcode::s_orn2_b32: 317bf215546Sopenharmony_ci case aco_opcode::s_orn2_b64: 318bf215546Sopenharmony_ci case aco_opcode::s_xor_b32: 319bf215546Sopenharmony_ci case aco_opcode::s_xor_b64: 320bf215546Sopenharmony_ci case aco_opcode::s_not_b32: 321bf215546Sopenharmony_ci case aco_opcode::s_not_b64: 322bf215546Sopenharmony_ci case aco_opcode::s_nor_b32: 323bf215546Sopenharmony_ci case aco_opcode::s_nor_b64: 324bf215546Sopenharmony_ci case aco_opcode::s_xnor_b32: 325bf215546Sopenharmony_ci case aco_opcode::s_xnor_b64: 326bf215546Sopenharmony_ci case aco_opcode::s_nand_b32: 327bf215546Sopenharmony_ci case aco_opcode::s_nand_b64: 328bf215546Sopenharmony_ci case aco_opcode::s_lshl_b32: 329bf215546Sopenharmony_ci case aco_opcode::s_lshl_b64: 330bf215546Sopenharmony_ci case aco_opcode::s_lshr_b32: 331bf215546Sopenharmony_ci case aco_opcode::s_lshr_b64: 332bf215546Sopenharmony_ci case aco_opcode::s_ashr_i32: 333bf215546Sopenharmony_ci case aco_opcode::s_ashr_i64: 334bf215546Sopenharmony_ci case aco_opcode::s_abs_i32: 335bf215546Sopenharmony_ci case aco_opcode::s_absdiff_i32: break; 336bf215546Sopenharmony_ci default: return; 337bf215546Sopenharmony_ci } 338bf215546Sopenharmony_ci 339bf215546Sopenharmony_ci /* Use the SCC def from wr_instr */ 340bf215546Sopenharmony_ci ctx.uses[instr->operands[0].tempId()]--; 341bf215546Sopenharmony_ci instr->operands[0] = Operand(wr_instr->definitions[1].getTemp(), scc); 342bf215546Sopenharmony_ci ctx.uses[instr->operands[0].tempId()]++; 343bf215546Sopenharmony_ci 344bf215546Sopenharmony_ci /* Set the opcode and operand to 32-bit */ 345bf215546Sopenharmony_ci instr->operands[1] = Operand::zero(); 346bf215546Sopenharmony_ci instr->opcode = 347bf215546Sopenharmony_ci (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 || 348bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cmp_eq_u64) 349bf215546Sopenharmony_ci ? aco_opcode::s_cmp_eq_u32 350bf215546Sopenharmony_ci : aco_opcode::s_cmp_lg_u32; 351bf215546Sopenharmony_ci } else if ((instr->format == Format::PSEUDO_BRANCH && instr->operands.size() == 1 && 352bf215546Sopenharmony_ci instr->operands[0].physReg() == scc) || 353bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_cselect_b32) { 354bf215546Sopenharmony_ci 355bf215546Sopenharmony_ci /* For cselect, operand 2 is the SCC condition */ 356bf215546Sopenharmony_ci unsigned scc_op_idx = 0; 357bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_cselect_b32) { 358bf215546Sopenharmony_ci scc_op_idx = 2; 359bf215546Sopenharmony_ci } 360bf215546Sopenharmony_ci 361bf215546Sopenharmony_ci Idx wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]); 362bf215546Sopenharmony_ci if (!wr_idx.found()) 363bf215546Sopenharmony_ci return; 364bf215546Sopenharmony_ci 365bf215546Sopenharmony_ci Instruction* wr_instr = ctx.get(wr_idx); 366bf215546Sopenharmony_ci 367bf215546Sopenharmony_ci /* Check if we found the pattern above. */ 368bf215546Sopenharmony_ci if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 && 369bf215546Sopenharmony_ci wr_instr->opcode != aco_opcode::s_cmp_lg_u32) 370bf215546Sopenharmony_ci return; 371bf215546Sopenharmony_ci if (wr_instr->operands[0].physReg() != scc) 372bf215546Sopenharmony_ci return; 373bf215546Sopenharmony_ci if (!wr_instr->operands[1].constantEquals(0)) 374bf215546Sopenharmony_ci return; 375bf215546Sopenharmony_ci 376bf215546Sopenharmony_ci /* The optimization can be unsafe when there are other users. */ 377bf215546Sopenharmony_ci if (ctx.uses[instr->operands[scc_op_idx].tempId()] > 1) 378bf215546Sopenharmony_ci return; 379bf215546Sopenharmony_ci 380bf215546Sopenharmony_ci if (wr_instr->opcode == aco_opcode::s_cmp_eq_u32) { 381bf215546Sopenharmony_ci /* Flip the meaning of the instruction to correctly use the SCC. */ 382bf215546Sopenharmony_ci if (instr->format == Format::PSEUDO_BRANCH) 383bf215546Sopenharmony_ci instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz 384bf215546Sopenharmony_ci : aco_opcode::p_cbranch_z; 385bf215546Sopenharmony_ci else if (instr->opcode == aco_opcode::s_cselect_b32) 386bf215546Sopenharmony_ci std::swap(instr->operands[0], instr->operands[1]); 387bf215546Sopenharmony_ci else 388bf215546Sopenharmony_ci unreachable( 389bf215546Sopenharmony_ci "scc_nocompare optimization is only implemented for p_cbranch and s_cselect"); 390bf215546Sopenharmony_ci } 391bf215546Sopenharmony_ci 392bf215546Sopenharmony_ci /* Use the SCC def from the original instruction, not the comparison */ 393bf215546Sopenharmony_ci ctx.uses[instr->operands[scc_op_idx].tempId()]--; 394bf215546Sopenharmony_ci instr->operands[scc_op_idx] = wr_instr->operands[0]; 395bf215546Sopenharmony_ci } 396bf215546Sopenharmony_ci} 397bf215546Sopenharmony_ci 398bf215546Sopenharmony_civoid 399bf215546Sopenharmony_citry_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr) 400bf215546Sopenharmony_ci{ 401bf215546Sopenharmony_ci /* We are looking for the following pattern: 402bf215546Sopenharmony_ci * 403bf215546Sopenharmony_ci * v_mov_dpp vA, vB, ... ; move instruction with DPP 404bf215546Sopenharmony_ci * v_xxx vC, vA, ... ; current instr that uses the result from the move 405bf215546Sopenharmony_ci * 406bf215546Sopenharmony_ci * If possible, the above is optimized into: 407bf215546Sopenharmony_ci * 408bf215546Sopenharmony_ci * v_xxx_dpp vC, vB, ... ; current instr modified to use DPP directly 409bf215546Sopenharmony_ci * 410bf215546Sopenharmony_ci */ 411bf215546Sopenharmony_ci 412bf215546Sopenharmony_ci if (!instr->isVALU() || instr->isDPP()) 413bf215546Sopenharmony_ci return; 414bf215546Sopenharmony_ci 415bf215546Sopenharmony_ci for (unsigned i = 0; i < MIN2(2, instr->operands.size()); i++) { 416bf215546Sopenharmony_ci Idx op_instr_idx = last_writer_idx(ctx, instr->operands[i]); 417bf215546Sopenharmony_ci if (!op_instr_idx.found()) 418bf215546Sopenharmony_ci continue; 419bf215546Sopenharmony_ci 420bf215546Sopenharmony_ci const Instruction* mov = ctx.get(op_instr_idx); 421bf215546Sopenharmony_ci if (mov->opcode != aco_opcode::v_mov_b32 || !mov->isDPP()) 422bf215546Sopenharmony_ci continue; 423bf215546Sopenharmony_ci bool dpp8 = mov->isDPP8(); 424bf215546Sopenharmony_ci if (!can_use_DPP(instr, false, dpp8)) 425bf215546Sopenharmony_ci return; 426bf215546Sopenharmony_ci 427bf215546Sopenharmony_ci /* If we aren't going to remove the v_mov_b32, we have to ensure that it doesn't overwrite 428bf215546Sopenharmony_ci * it's own operand before we use it. 429bf215546Sopenharmony_ci */ 430bf215546Sopenharmony_ci if (mov->definitions[0].physReg() == mov->operands[0].physReg() && 431bf215546Sopenharmony_ci (!mov->definitions[0].tempId() || ctx.uses[mov->definitions[0].tempId()] > 1)) 432bf215546Sopenharmony_ci continue; 433bf215546Sopenharmony_ci 434bf215546Sopenharmony_ci /* Don't propagate DPP if the source register is overwritten since the move. */ 435bf215546Sopenharmony_ci if (is_clobbered_since(ctx, mov->operands[0], op_instr_idx)) 436bf215546Sopenharmony_ci continue; 437bf215546Sopenharmony_ci 438bf215546Sopenharmony_ci if (i && !can_swap_operands(instr, &instr->opcode)) 439bf215546Sopenharmony_ci continue; 440bf215546Sopenharmony_ci 441bf215546Sopenharmony_ci if (!dpp8) /* anything else doesn't make sense in SSA */ 442bf215546Sopenharmony_ci assert(mov->dpp16().row_mask == 0xf && mov->dpp16().bank_mask == 0xf); 443bf215546Sopenharmony_ci 444bf215546Sopenharmony_ci if (--ctx.uses[mov->definitions[0].tempId()]) 445bf215546Sopenharmony_ci ctx.uses[mov->operands[0].tempId()]++; 446bf215546Sopenharmony_ci 447bf215546Sopenharmony_ci convert_to_DPP(instr, dpp8); 448bf215546Sopenharmony_ci 449bf215546Sopenharmony_ci if (dpp8) { 450bf215546Sopenharmony_ci DPP8_instruction* dpp = &instr->dpp8(); 451bf215546Sopenharmony_ci if (i) { 452bf215546Sopenharmony_ci std::swap(dpp->operands[0], dpp->operands[1]); 453bf215546Sopenharmony_ci } 454bf215546Sopenharmony_ci dpp->operands[0] = mov->operands[0]; 455bf215546Sopenharmony_ci memcpy(dpp->lane_sel, mov->dpp8().lane_sel, sizeof(dpp->lane_sel)); 456bf215546Sopenharmony_ci } else { 457bf215546Sopenharmony_ci DPP16_instruction* dpp = &instr->dpp16(); 458bf215546Sopenharmony_ci if (i) { 459bf215546Sopenharmony_ci std::swap(dpp->operands[0], dpp->operands[1]); 460bf215546Sopenharmony_ci std::swap(dpp->neg[0], dpp->neg[1]); 461bf215546Sopenharmony_ci std::swap(dpp->abs[0], dpp->abs[1]); 462bf215546Sopenharmony_ci } 463bf215546Sopenharmony_ci dpp->operands[0] = mov->operands[0]; 464bf215546Sopenharmony_ci dpp->dpp_ctrl = mov->dpp16().dpp_ctrl; 465bf215546Sopenharmony_ci dpp->bound_ctrl = true; 466bf215546Sopenharmony_ci dpp->neg[0] ^= mov->dpp16().neg[0] && !dpp->abs[0]; 467bf215546Sopenharmony_ci dpp->abs[0] |= mov->dpp16().abs[0]; 468bf215546Sopenharmony_ci } 469bf215546Sopenharmony_ci return; 470bf215546Sopenharmony_ci } 471bf215546Sopenharmony_ci} 472bf215546Sopenharmony_ci 473bf215546Sopenharmony_civoid 474bf215546Sopenharmony_ciprocess_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr) 475bf215546Sopenharmony_ci{ 476bf215546Sopenharmony_ci try_apply_branch_vcc(ctx, instr); 477bf215546Sopenharmony_ci 478bf215546Sopenharmony_ci try_optimize_scc_nocompare(ctx, instr); 479bf215546Sopenharmony_ci 480bf215546Sopenharmony_ci try_combine_dpp(ctx, instr); 481bf215546Sopenharmony_ci 482bf215546Sopenharmony_ci if (instr) 483bf215546Sopenharmony_ci save_reg_writes(ctx, instr); 484bf215546Sopenharmony_ci 485bf215546Sopenharmony_ci ctx.current_instr_idx++; 486bf215546Sopenharmony_ci} 487bf215546Sopenharmony_ci 488bf215546Sopenharmony_ci} // namespace 489bf215546Sopenharmony_ci 490bf215546Sopenharmony_civoid 491bf215546Sopenharmony_cioptimize_postRA(Program* program) 492bf215546Sopenharmony_ci{ 493bf215546Sopenharmony_ci pr_opt_ctx ctx; 494bf215546Sopenharmony_ci ctx.program = program; 495bf215546Sopenharmony_ci ctx.uses = dead_code_analysis(program); 496bf215546Sopenharmony_ci ctx.instr_idx_by_regs.resize(program->blocks.size()); 497bf215546Sopenharmony_ci 498bf215546Sopenharmony_ci /* Forward pass 499bf215546Sopenharmony_ci * Goes through each instruction exactly once, and can transform 500bf215546Sopenharmony_ci * instructions or adjust the use counts of temps. 501bf215546Sopenharmony_ci */ 502bf215546Sopenharmony_ci for (auto& block : program->blocks) { 503bf215546Sopenharmony_ci ctx.reset_block(&block); 504bf215546Sopenharmony_ci 505bf215546Sopenharmony_ci for (aco_ptr<Instruction>& instr : block.instructions) 506bf215546Sopenharmony_ci process_instruction(ctx, instr); 507bf215546Sopenharmony_ci } 508bf215546Sopenharmony_ci 509bf215546Sopenharmony_ci /* Cleanup pass 510bf215546Sopenharmony_ci * Gets rid of instructions which are manually deleted or 511bf215546Sopenharmony_ci * no longer have any uses. 512bf215546Sopenharmony_ci */ 513bf215546Sopenharmony_ci for (auto& block : program->blocks) { 514bf215546Sopenharmony_ci auto new_end = std::remove_if(block.instructions.begin(), block.instructions.end(), 515bf215546Sopenharmony_ci [&ctx](const aco_ptr<Instruction>& instr) 516bf215546Sopenharmony_ci { return !instr || is_dead(ctx.uses, instr.get()); }); 517bf215546Sopenharmony_ci block.instructions.resize(new_end - block.instructions.begin()); 518bf215546Sopenharmony_ci } 519bf215546Sopenharmony_ci} 520bf215546Sopenharmony_ci 521bf215546Sopenharmony_ci} // namespace aco 522