/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"

#include "util/u_math.h"

#include <set>
#include <vector>

namespace aco {
RegisterDemand
get_live_changes(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}

RegisterDemand
get_temp_registers(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;

   for (Definition def : instr->definitions) {
      if (!def.isTemp())
         continue;
      if (def.isKill())
         temp_registers += def.getTemp();
   }

   for (Operand op : instr->operands) {
      if (op.isTemp() && op.isLateKill() && op.isFirstKill())
         temp_registers += op.getTemp();
   }

   return temp_registers;
}

RegisterDemand
get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                  aco_ptr<Instruction>& instr_before)
{
   demand -= get_live_changes(instr);
   demand -= get_temp_registers(instr);
   if (instr_before)
      demand += get_temp_registers(instr_before);
   return demand;
}

namespace {
struct PhiInfo {
   uint16_t logical_phi_sgpr_ops = 0;
   uint16_t linear_phi_ops = 0;
   uint16_t linear_phi_defs = 0;
};

bool
instr_needs_vcc(Instruction* instr)
{
   if (instr->isVOPC())
      return true;
   if (instr->isVOP2() && !instr->isVOP3()) {
      /* a third, SGPR operand on plain VOP2 is a carry-in/lane mask read from VCC */
      if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
          instr->operands[2].regClass().type() == RegType::sgpr)
         return true;
      /* a second definition on plain VOP2 is a carry-out written to VCC */
      if (instr->definitions.size() == 2)
         return true;
   }
   return false;
}
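
/* Classic backward liveness dataflow, one block at a time: start from the
 * block's live-out set, walk the instructions in reverse while killing
 * definitions and adding operands (GEN), and record the register demand at
 * each instruction along the way. Whenever a predecessor's live-out set
 * changes, the worklist entry for that predecessor is raised so it gets
 * processed again, until the analysis reaches a fixed point.
 */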
void
process_live_temps_per_block(Program* program, live& lives, Block* block, unsigned& worklist,
                             std::vector<PhiInfo>& phi_info)
{
   std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
   RegisterDemand new_demand;

   register_demand.resize(block->instructions.size());
   RegisterDemand block_register_demand;
   IDSet live = lives.live_out[block->index];

   /* initialize register demand */
   for (unsigned t : live)
      new_demand += Temp(t, program->temp_rc[t]);
   new_demand.sgpr -= phi_info[block->index].logical_phi_sgpr_ops;

   /* traverse the instructions backwards */
   int idx;
   for (idx = block->instructions.size() - 1; idx >= 0; idx--) {
      Instruction* insn = block->instructions[idx].get();
      if (is_phi(insn))
         break;

      program->needs_vcc |= instr_needs_vcc(insn);
      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr);

      /* KILL */
      for (Definition& definition : insn->definitions) {
         if (!definition.isTemp()) {
            continue;
         }
         if (definition.isFixed() && definition.physReg() == vcc)
            program->needs_vcc = true;

         const Temp temp = definition.getTemp();
         const size_t n = live.erase(temp.id());

         if (n) {
            new_demand -= temp;
            definition.setKill(false);
         } else {
            register_demand[idx] += temp;
            definition.setKill(true);
         }
      }

      /* GEN */
      if (insn->opcode == aco_opcode::p_logical_end) {
         new_demand.sgpr += phi_info[block->index].logical_phi_sgpr_ops;
      } else {
         /* we need to do this in a separate loop because the next one can
          * setKill() for several operands at once and we don't want to
          * overwrite that in a later iteration */
         for (Operand& op : insn->operands)
            op.setKill(false);

         for (unsigned i = 0; i < insn->operands.size(); ++i) {
            Operand& operand = insn->operands[i];
            if (!operand.isTemp())
               continue;
            if (operand.isFixed() && operand.physReg() == vcc)
               program->needs_vcc = true;
            const Temp temp = operand.getTemp();
            const bool inserted = live.insert(temp.id()).second;
            if (inserted) {
               operand.setFirstKill(true);
               for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
                  if (insn->operands[j].isTemp() &&
                      insn->operands[j].tempId() == operand.tempId()) {
                     insn->operands[j].setFirstKill(false);
                     insn->operands[j].setKill(true);
                  }
               }
               if (operand.isLateKill())
                  register_demand[idx] += temp;
               new_demand += temp;
            }
         }
      }

      block_register_demand.update(register_demand[idx]);
   }

   /* update the block's register demand one last time */
   block_register_demand.update(new_demand);
   if (program->progress < CompilationProgress::after_ra)
      block->register_demand = block_register_demand;

   /* handle phi definitions */
   uint16_t linear_phi_defs = 0;
   int phi_idx = idx;
   while (phi_idx >= 0) {
      register_demand[phi_idx] = new_demand;
      Instruction* insn = block->instructions[phi_idx].get();

      assert(is_phi(insn) && insn->definitions.size() == 1);
      if (!insn->definitions[0].isTemp()) {
         assert(insn->definitions[0].isFixed() && insn->definitions[0].physReg() == exec);
         phi_idx--;
         continue;
      }
      Definition& definition = insn->definitions[0];
      if (definition.isFixed() && definition.physReg() == vcc)
         program->needs_vcc = true;
      const Temp temp = definition.getTemp();
      const size_t n = live.erase(temp.id());

      if (n)
         definition.setKill(false);
      else
         definition.setKill(true);

      if (insn->opcode == aco_opcode::p_linear_phi) {
         assert(definition.getTemp().type() == RegType::sgpr);
         linear_phi_defs += definition.size();
      }

      phi_idx--;
   }
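
   /* Linear phis are later lowered to parallelcopies at the end of their
    * predecessors, so their definitions temporarily occupy SGPRs there as
    * well. Record the total size so live_var_analysis() can add it to the
    * demand at each predecessor's branch.
    */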
   for (unsigned pred_idx : block->linear_preds)
      phi_info[pred_idx].linear_phi_defs = linear_phi_defs;

   /* now, we need to merge the live-ins into the predecessors' live-out sets */
   for (unsigned t : live) {
      RegClass rc = program->temp_rc[t];
      std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;

#ifndef NDEBUG
      if (preds.empty())
         aco_err(program, "Temporary never defined or defined after use: %%%d in BB%d", t,
                 block->index);
#endif

      for (unsigned pred_idx : preds) {
         auto it = lives.live_out[pred_idx].insert(t);
         if (it.second)
            worklist = std::max(worklist, pred_idx + 1);
      }
   }

   /* handle phi operands */
   phi_idx = idx;
   while (phi_idx >= 0) {
      Instruction* insn = block->instructions[phi_idx].get();
      assert(is_phi(insn));
      /* directly insert into the predecessors' live-out sets */
      std::vector<unsigned>& preds =
         insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
      for (unsigned i = 0; i < preds.size(); ++i) {
         Operand& operand = insn->operands[i];
         if (!operand.isTemp())
            continue;
         if (operand.isFixed() && operand.physReg() == vcc)
            program->needs_vcc = true;
         /* check if we changed an already processed block */
         const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;
         if (inserted) {
            worklist = std::max(worklist, preds[i] + 1);
            if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr) {
               phi_info[preds[i]].logical_phi_sgpr_ops += operand.size();
            } else if (insn->opcode == aco_opcode::p_linear_phi) {
               assert(operand.getTemp().type() == RegType::sgpr);
               phi_info[preds[i]].linear_phi_ops += operand.size();
            }
         }

         /* set if the operand is killed by this (or another) phi instruction */
         operand.setKill(!live.count(operand.tempId()));
      }
      phi_idx--;
   }

   assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));
}

unsigned
calc_waves_per_workgroup(Program* program)
{
   /* When the workgroup size is not known, just go with wave_size */
   unsigned workgroup_size =
      program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size;

   return align(workgroup_size, program->wave_size) / program->wave_size;
}
} /* end namespace */
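
/* In addition to the shader-addressable SGPRs, the hardware allocation has to
 * cover VCC, FLAT_SCRATCH and the XNACK mask, depending on the generation and
 * on what the program actually uses. On GFX10+ these no longer take up space
 * in the SGPR file, so nothing extra is allocated there.
 */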
uint16_t
get_extra_sgprs(Program* program)
{
   /* We don't use the FLAT_SCRATCH register on GFX6-8 and it's removed on GFX10+. */
   bool needs_flat_scr = program->config->scratch_bytes_per_wave && program->gfx_level == GFX9;

   if (program->gfx_level >= GFX10) {
      assert(!program->dev.xnack_enabled);
      return 0;
   } else if (program->gfx_level >= GFX8) {
      if (needs_flat_scr)
         return 6;
      else if (program->dev.xnack_enabled)
         return 4;
      else if (program->needs_vcc)
         return 2;
      else
         return 0;
   } else {
      assert(!program->dev.xnack_enabled);
      if (needs_flat_scr)
         return 4;
      else if (program->needs_vcc)
         return 2;
      else
         return 0;
   }
}

uint16_t
get_sgpr_alloc(Program* program, uint16_t addressable_sgprs)
{
   uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
   uint16_t granule = program->dev.sgpr_alloc_granule;
   return ALIGN_NPOT(std::max(sgprs, granule), granule);
}

uint16_t
get_vgpr_alloc(Program* program, uint16_t addressable_vgprs)
{
   assert(addressable_vgprs <= program->dev.vgpr_limit);
   uint16_t granule = program->dev.vgpr_alloc_granule;
   return align(std::max(addressable_vgprs, granule), granule);
}

unsigned
round_down(unsigned a, unsigned b)
{
   return a - (a % b);
}

uint16_t
get_addr_sgpr_from_waves(Program* program, uint16_t waves)
{
   /* it's not possible to allocate more than 128 SGPRs */
   uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);
   sgprs = round_down(sgprs, program->dev.sgpr_alloc_granule);
   sgprs -= get_extra_sgprs(program);
   return std::min(sgprs, program->dev.sgpr_limit);
}

uint16_t
get_addr_vgpr_from_waves(Program* program, uint16_t waves)
{
   uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1);
   vgprs -= program->config->num_shared_vgprs / 2;
   return std::min(vgprs, program->dev.vgpr_limit);
}

void
calc_min_waves(Program* program)
{
   unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
   unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
   program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
}

uint16_t
max_suitable_waves(Program* program, uint16_t waves)
{
   unsigned num_simd = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
   unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
   unsigned num_workgroups = waves * num_simd / waves_per_workgroup;

   /* Adjust #workgroups for LDS */
   unsigned lds_per_workgroup = align(program->config->lds_size * program->dev.lds_encoding_granule,
                                      program->dev.lds_alloc_granule);

   if (program->stage == fragment_fs) {
      /* PS inputs are moved from PC (parameter cache) to LDS before PS waves are launched.
       * Each PS input occupies 3x vec4 of LDS space. See Figure 10.3 in the GCN3 ISA manual.
       * These limit occupancy the same way as other stages' LDS usage does.
       */
      unsigned lds_bytes_per_interp = 3 * 16;
      unsigned lds_param_bytes = lds_bytes_per_interp * program->info.ps.num_interp;
      lds_per_workgroup += align(lds_param_bytes, program->dev.lds_alloc_granule);
   }
   unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit;
   if (lds_per_workgroup)
      num_workgroups = std::min(num_workgroups, lds_limit / lds_per_workgroup);

   /* Hardware limitation */
   if (waves_per_workgroup > 1)
      num_workgroups = std::min(num_workgroups, program->wgp_mode ? 32u : 16u);

   /* Adjust #waves for workgroup multiples:
    * in cases like waves_per_workgroup=3, or lds=65536 and
    * waves_per_workgroup=1, we want the maximum possible number of waves per
    * SIMD and not the minimum, so DIV_ROUND_UP is used.
    */
   unsigned workgroup_waves = num_workgroups * waves_per_workgroup;
   return DIV_ROUND_UP(workgroup_waves, num_simd);
}
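
/* A sketch of the arithmetic below with illustrative GFX9-like values
 * (800 physical SGPRs per SIMD, SGPR allocation granule of 16): a demand of
 * 100 SGPRs plus 2 extra SGPRs for VCC yields get_sgpr_alloc() =
 * align(102, 16) = 112, so num_waves = 800 / 112 = 7 waves per SIMD before
 * the VGPR, LDS and workgroup limits are applied.
 */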
void
update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
{
   assert(program->min_waves >= 1);
   uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
   uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);

   /* this won't compile: register pressure reduction is necessary */
   if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) {
      program->num_waves = 0;
      program->max_reg_demand = new_demand;
   } else {
      program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
      uint16_t vgpr_demand =
         get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
      program->num_waves =
         std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
      uint16_t max_waves = program->dev.max_wave64_per_simd * (64 / program->wave_size);
      program->num_waves = std::min(program->num_waves, max_waves);

      /* Adjust for LDS and workgroup multiples and calculate max_reg_demand */
      program->num_waves = max_suitable_waves(program, program->num_waves);
      program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
      program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
   }
}

live
live_var_analysis(Program* program)
{
   live result;
   result.live_out.resize(program->blocks.size());
   result.register_demand.resize(program->blocks.size());
   unsigned worklist = program->blocks.size();
   std::vector<PhiInfo> phi_info(program->blocks.size());
   RegisterDemand new_demand;

   program->needs_vcc = program->gfx_level >= GFX10;

   /* this implementation assumes that the block idx corresponds to the block's position in
    * the program->blocks vector */
   while (worklist) {
      unsigned block_idx = --worklist;
      process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist,
                                   phi_info);
      new_demand.update(program->blocks[block_idx].register_demand);
   }

   /* Handle branches: we will insert copies created for linear phis just before the branch. */
   for (Block& block : program->blocks) {
      result.register_demand[block.index].back().sgpr += phi_info[block.index].linear_phi_defs;
      result.register_demand[block.index].back().sgpr -= phi_info[block.index].linear_phi_ops;
   }

   /* calculate the program's register demand and number of waves */
   if (program->progress < CompilationProgress::after_ra)
      update_vgpr_sgpr_demand(program, new_demand);

   return result;
}

} // namespace aco