1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2018 Valve Corporation 3bf215546Sopenharmony_ci * Copyright © 2018 Google 4bf215546Sopenharmony_ci * 5bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 6bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 7bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 8bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 10bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 11bf215546Sopenharmony_ci * 12bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 13bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 14bf215546Sopenharmony_ci * Software. 15bf215546Sopenharmony_ci * 16bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22bf215546Sopenharmony_ci * IN THE SOFTWARE. 23bf215546Sopenharmony_ci * 24bf215546Sopenharmony_ci */ 25bf215546Sopenharmony_ci 26bf215546Sopenharmony_ci#include "aco_ir.h" 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci#include "util/u_math.h" 29bf215546Sopenharmony_ci 30bf215546Sopenharmony_ci#include <set> 31bf215546Sopenharmony_ci#include <vector> 32bf215546Sopenharmony_ci 33bf215546Sopenharmony_cinamespace aco { 34bf215546Sopenharmony_ciRegisterDemand 35bf215546Sopenharmony_ciget_live_changes(aco_ptr<Instruction>& instr) 36bf215546Sopenharmony_ci{ 37bf215546Sopenharmony_ci RegisterDemand changes; 38bf215546Sopenharmony_ci for (const Definition& def : instr->definitions) { 39bf215546Sopenharmony_ci if (!def.isTemp() || def.isKill()) 40bf215546Sopenharmony_ci continue; 41bf215546Sopenharmony_ci changes += def.getTemp(); 42bf215546Sopenharmony_ci } 43bf215546Sopenharmony_ci 44bf215546Sopenharmony_ci for (const Operand& op : instr->operands) { 45bf215546Sopenharmony_ci if (!op.isTemp() || !op.isFirstKill()) 46bf215546Sopenharmony_ci continue; 47bf215546Sopenharmony_ci changes -= op.getTemp(); 48bf215546Sopenharmony_ci } 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_ci return changes; 51bf215546Sopenharmony_ci} 52bf215546Sopenharmony_ci 53bf215546Sopenharmony_ciRegisterDemand 54bf215546Sopenharmony_ciget_temp_registers(aco_ptr<Instruction>& instr) 55bf215546Sopenharmony_ci{ 56bf215546Sopenharmony_ci RegisterDemand temp_registers; 57bf215546Sopenharmony_ci 58bf215546Sopenharmony_ci for (Definition def : instr->definitions) { 59bf215546Sopenharmony_ci if (!def.isTemp()) 60bf215546Sopenharmony_ci continue; 61bf215546Sopenharmony_ci if (def.isKill()) 62bf215546Sopenharmony_ci temp_registers += def.getTemp(); 63bf215546Sopenharmony_ci } 64bf215546Sopenharmony_ci 65bf215546Sopenharmony_ci for (Operand op : instr->operands) { 66bf215546Sopenharmony_ci if (op.isTemp() && op.isLateKill() && op.isFirstKill()) 67bf215546Sopenharmony_ci temp_registers += op.getTemp(); 68bf215546Sopenharmony_ci } 69bf215546Sopenharmony_ci 70bf215546Sopenharmony_ci return temp_registers; 71bf215546Sopenharmony_ci} 72bf215546Sopenharmony_ci 73bf215546Sopenharmony_ciRegisterDemand 74bf215546Sopenharmony_ciget_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, 75bf215546Sopenharmony_ci aco_ptr<Instruction>& instr_before) 76bf215546Sopenharmony_ci{ 77bf215546Sopenharmony_ci demand -= get_live_changes(instr); 78bf215546Sopenharmony_ci demand -= get_temp_registers(instr); 79bf215546Sopenharmony_ci if (instr_before) 80bf215546Sopenharmony_ci demand += get_temp_registers(instr_before); 81bf215546Sopenharmony_ci return demand; 82bf215546Sopenharmony_ci} 83bf215546Sopenharmony_ci 84bf215546Sopenharmony_cinamespace { 85bf215546Sopenharmony_cistruct PhiInfo { 86bf215546Sopenharmony_ci uint16_t logical_phi_sgpr_ops = 0; 87bf215546Sopenharmony_ci uint16_t linear_phi_ops = 0; 88bf215546Sopenharmony_ci uint16_t linear_phi_defs = 0; 89bf215546Sopenharmony_ci}; 90bf215546Sopenharmony_ci 91bf215546Sopenharmony_cibool 92bf215546Sopenharmony_ciinstr_needs_vcc(Instruction* instr) 93bf215546Sopenharmony_ci{ 94bf215546Sopenharmony_ci if (instr->isVOPC()) 95bf215546Sopenharmony_ci return true; 96bf215546Sopenharmony_ci if (instr->isVOP2() && !instr->isVOP3()) { 97bf215546Sopenharmony_ci if (instr->operands.size() == 3 && instr->operands[2].isTemp() && 98bf215546Sopenharmony_ci instr->operands[2].regClass().type() == RegType::sgpr) 99bf215546Sopenharmony_ci return true; 100bf215546Sopenharmony_ci if (instr->definitions.size() == 2) 101bf215546Sopenharmony_ci return true; 102bf215546Sopenharmony_ci } 103bf215546Sopenharmony_ci return false; 104bf215546Sopenharmony_ci} 105bf215546Sopenharmony_ci 106bf215546Sopenharmony_civoid 107bf215546Sopenharmony_ciprocess_live_temps_per_block(Program* program, live& lives, Block* block, unsigned& worklist, 108bf215546Sopenharmony_ci std::vector<PhiInfo>& phi_info) 109bf215546Sopenharmony_ci{ 110bf215546Sopenharmony_ci std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index]; 111bf215546Sopenharmony_ci RegisterDemand new_demand; 112bf215546Sopenharmony_ci 113bf215546Sopenharmony_ci register_demand.resize(block->instructions.size()); 114bf215546Sopenharmony_ci RegisterDemand block_register_demand; 115bf215546Sopenharmony_ci IDSet live = lives.live_out[block->index]; 116bf215546Sopenharmony_ci 117bf215546Sopenharmony_ci /* initialize register demand */ 118bf215546Sopenharmony_ci for (unsigned t : live) 119bf215546Sopenharmony_ci new_demand += Temp(t, program->temp_rc[t]); 120bf215546Sopenharmony_ci new_demand.sgpr -= phi_info[block->index].logical_phi_sgpr_ops; 121bf215546Sopenharmony_ci 122bf215546Sopenharmony_ci /* traverse the instructions backwards */ 123bf215546Sopenharmony_ci int idx; 124bf215546Sopenharmony_ci for (idx = block->instructions.size() - 1; idx >= 0; idx--) { 125bf215546Sopenharmony_ci Instruction* insn = block->instructions[idx].get(); 126bf215546Sopenharmony_ci if (is_phi(insn)) 127bf215546Sopenharmony_ci break; 128bf215546Sopenharmony_ci 129bf215546Sopenharmony_ci program->needs_vcc |= instr_needs_vcc(insn); 130bf215546Sopenharmony_ci register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr); 131bf215546Sopenharmony_ci 132bf215546Sopenharmony_ci /* KILL */ 133bf215546Sopenharmony_ci for (Definition& definition : insn->definitions) { 134bf215546Sopenharmony_ci if (!definition.isTemp()) { 135bf215546Sopenharmony_ci continue; 136bf215546Sopenharmony_ci } 137bf215546Sopenharmony_ci if (definition.isFixed() && definition.physReg() == vcc) 138bf215546Sopenharmony_ci program->needs_vcc = true; 139bf215546Sopenharmony_ci 140bf215546Sopenharmony_ci const Temp temp = definition.getTemp(); 141bf215546Sopenharmony_ci const size_t n = live.erase(temp.id()); 142bf215546Sopenharmony_ci 143bf215546Sopenharmony_ci if (n) { 144bf215546Sopenharmony_ci new_demand -= temp; 145bf215546Sopenharmony_ci definition.setKill(false); 146bf215546Sopenharmony_ci } else { 147bf215546Sopenharmony_ci register_demand[idx] += temp; 148bf215546Sopenharmony_ci definition.setKill(true); 149bf215546Sopenharmony_ci } 150bf215546Sopenharmony_ci } 151bf215546Sopenharmony_ci 152bf215546Sopenharmony_ci /* GEN */ 153bf215546Sopenharmony_ci if (insn->opcode == aco_opcode::p_logical_end) { 154bf215546Sopenharmony_ci new_demand.sgpr += phi_info[block->index].logical_phi_sgpr_ops; 155bf215546Sopenharmony_ci } else { 156bf215546Sopenharmony_ci /* we need to do this in a separate loop because the next one can 157bf215546Sopenharmony_ci * setKill() for several operands at once and we don't want to 158bf215546Sopenharmony_ci * overwrite that in a later iteration */ 159bf215546Sopenharmony_ci for (Operand& op : insn->operands) 160bf215546Sopenharmony_ci op.setKill(false); 161bf215546Sopenharmony_ci 162bf215546Sopenharmony_ci for (unsigned i = 0; i < insn->operands.size(); ++i) { 163bf215546Sopenharmony_ci Operand& operand = insn->operands[i]; 164bf215546Sopenharmony_ci if (!operand.isTemp()) 165bf215546Sopenharmony_ci continue; 166bf215546Sopenharmony_ci if (operand.isFixed() && operand.physReg() == vcc) 167bf215546Sopenharmony_ci program->needs_vcc = true; 168bf215546Sopenharmony_ci const Temp temp = operand.getTemp(); 169bf215546Sopenharmony_ci const bool inserted = live.insert(temp.id()).second; 170bf215546Sopenharmony_ci if (inserted) { 171bf215546Sopenharmony_ci operand.setFirstKill(true); 172bf215546Sopenharmony_ci for (unsigned j = i + 1; j < insn->operands.size(); ++j) { 173bf215546Sopenharmony_ci if (insn->operands[j].isTemp() && 174bf215546Sopenharmony_ci insn->operands[j].tempId() == operand.tempId()) { 175bf215546Sopenharmony_ci insn->operands[j].setFirstKill(false); 176bf215546Sopenharmony_ci insn->operands[j].setKill(true); 177bf215546Sopenharmony_ci } 178bf215546Sopenharmony_ci } 179bf215546Sopenharmony_ci if (operand.isLateKill()) 180bf215546Sopenharmony_ci register_demand[idx] += temp; 181bf215546Sopenharmony_ci new_demand += temp; 182bf215546Sopenharmony_ci } 183bf215546Sopenharmony_ci } 184bf215546Sopenharmony_ci } 185bf215546Sopenharmony_ci 186bf215546Sopenharmony_ci block_register_demand.update(register_demand[idx]); 187bf215546Sopenharmony_ci } 188bf215546Sopenharmony_ci 189bf215546Sopenharmony_ci /* update block's register demand for a last time */ 190bf215546Sopenharmony_ci block_register_demand.update(new_demand); 191bf215546Sopenharmony_ci if (program->progress < CompilationProgress::after_ra) 192bf215546Sopenharmony_ci block->register_demand = block_register_demand; 193bf215546Sopenharmony_ci 194bf215546Sopenharmony_ci /* handle phi definitions */ 195bf215546Sopenharmony_ci uint16_t linear_phi_defs = 0; 196bf215546Sopenharmony_ci int phi_idx = idx; 197bf215546Sopenharmony_ci while (phi_idx >= 0) { 198bf215546Sopenharmony_ci register_demand[phi_idx] = new_demand; 199bf215546Sopenharmony_ci Instruction* insn = block->instructions[phi_idx].get(); 200bf215546Sopenharmony_ci 201bf215546Sopenharmony_ci assert(is_phi(insn) && insn->definitions.size() == 1); 202bf215546Sopenharmony_ci if (!insn->definitions[0].isTemp()) { 203bf215546Sopenharmony_ci assert(insn->definitions[0].isFixed() && insn->definitions[0].physReg() == exec); 204bf215546Sopenharmony_ci phi_idx--; 205bf215546Sopenharmony_ci continue; 206bf215546Sopenharmony_ci } 207bf215546Sopenharmony_ci Definition& definition = insn->definitions[0]; 208bf215546Sopenharmony_ci if (definition.isFixed() && definition.physReg() == vcc) 209bf215546Sopenharmony_ci program->needs_vcc = true; 210bf215546Sopenharmony_ci const Temp temp = definition.getTemp(); 211bf215546Sopenharmony_ci const size_t n = live.erase(temp.id()); 212bf215546Sopenharmony_ci 213bf215546Sopenharmony_ci if (n) 214bf215546Sopenharmony_ci definition.setKill(false); 215bf215546Sopenharmony_ci else 216bf215546Sopenharmony_ci definition.setKill(true); 217bf215546Sopenharmony_ci 218bf215546Sopenharmony_ci if (insn->opcode == aco_opcode::p_linear_phi) { 219bf215546Sopenharmony_ci assert(definition.getTemp().type() == RegType::sgpr); 220bf215546Sopenharmony_ci linear_phi_defs += definition.size(); 221bf215546Sopenharmony_ci } 222bf215546Sopenharmony_ci 223bf215546Sopenharmony_ci phi_idx--; 224bf215546Sopenharmony_ci } 225bf215546Sopenharmony_ci 226bf215546Sopenharmony_ci for (unsigned pred_idx : block->linear_preds) 227bf215546Sopenharmony_ci phi_info[pred_idx].linear_phi_defs = linear_phi_defs; 228bf215546Sopenharmony_ci 229bf215546Sopenharmony_ci /* now, we need to merge the live-ins into the live-out sets */ 230bf215546Sopenharmony_ci for (unsigned t : live) { 231bf215546Sopenharmony_ci RegClass rc = program->temp_rc[t]; 232bf215546Sopenharmony_ci std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds; 233bf215546Sopenharmony_ci 234bf215546Sopenharmony_ci#ifndef NDEBUG 235bf215546Sopenharmony_ci if (preds.empty()) 236bf215546Sopenharmony_ci aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, 237bf215546Sopenharmony_ci block->index); 238bf215546Sopenharmony_ci#endif 239bf215546Sopenharmony_ci 240bf215546Sopenharmony_ci for (unsigned pred_idx : preds) { 241bf215546Sopenharmony_ci auto it = lives.live_out[pred_idx].insert(t); 242bf215546Sopenharmony_ci if (it.second) 243bf215546Sopenharmony_ci worklist = std::max(worklist, pred_idx + 1); 244bf215546Sopenharmony_ci } 245bf215546Sopenharmony_ci } 246bf215546Sopenharmony_ci 247bf215546Sopenharmony_ci /* handle phi operands */ 248bf215546Sopenharmony_ci phi_idx = idx; 249bf215546Sopenharmony_ci while (phi_idx >= 0) { 250bf215546Sopenharmony_ci Instruction* insn = block->instructions[phi_idx].get(); 251bf215546Sopenharmony_ci assert(is_phi(insn)); 252bf215546Sopenharmony_ci /* directly insert into the predecessors live-out set */ 253bf215546Sopenharmony_ci std::vector<unsigned>& preds = 254bf215546Sopenharmony_ci insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; 255bf215546Sopenharmony_ci for (unsigned i = 0; i < preds.size(); ++i) { 256bf215546Sopenharmony_ci Operand& operand = insn->operands[i]; 257bf215546Sopenharmony_ci if (!operand.isTemp()) 258bf215546Sopenharmony_ci continue; 259bf215546Sopenharmony_ci if (operand.isFixed() && operand.physReg() == vcc) 260bf215546Sopenharmony_ci program->needs_vcc = true; 261bf215546Sopenharmony_ci /* check if we changed an already processed block */ 262bf215546Sopenharmony_ci const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second; 263bf215546Sopenharmony_ci if (inserted) { 264bf215546Sopenharmony_ci worklist = std::max(worklist, preds[i] + 1); 265bf215546Sopenharmony_ci if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr) { 266bf215546Sopenharmony_ci phi_info[preds[i]].logical_phi_sgpr_ops += operand.size(); 267bf215546Sopenharmony_ci } else if (insn->opcode == aco_opcode::p_linear_phi) { 268bf215546Sopenharmony_ci assert(operand.getTemp().type() == RegType::sgpr); 269bf215546Sopenharmony_ci phi_info[preds[i]].linear_phi_ops += operand.size(); 270bf215546Sopenharmony_ci } 271bf215546Sopenharmony_ci } 272bf215546Sopenharmony_ci 273bf215546Sopenharmony_ci /* set if the operand is killed by this (or another) phi instruction */ 274bf215546Sopenharmony_ci operand.setKill(!live.count(operand.tempId())); 275bf215546Sopenharmony_ci } 276bf215546Sopenharmony_ci phi_idx--; 277bf215546Sopenharmony_ci } 278bf215546Sopenharmony_ci 279bf215546Sopenharmony_ci assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty())); 280bf215546Sopenharmony_ci} 281bf215546Sopenharmony_ci 282bf215546Sopenharmony_ciunsigned 283bf215546Sopenharmony_cicalc_waves_per_workgroup(Program* program) 284bf215546Sopenharmony_ci{ 285bf215546Sopenharmony_ci /* When workgroup size is not known, just go with wave_size */ 286bf215546Sopenharmony_ci unsigned workgroup_size = 287bf215546Sopenharmony_ci program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size; 288bf215546Sopenharmony_ci 289bf215546Sopenharmony_ci return align(workgroup_size, program->wave_size) / program->wave_size; 290bf215546Sopenharmony_ci} 291bf215546Sopenharmony_ci} /* end namespace */ 292bf215546Sopenharmony_ci 293bf215546Sopenharmony_ciuint16_t 294bf215546Sopenharmony_ciget_extra_sgprs(Program* program) 295bf215546Sopenharmony_ci{ 296bf215546Sopenharmony_ci /* We don't use this register on GFX6-8 and it's removed on GFX10+. */ 297bf215546Sopenharmony_ci bool needs_flat_scr = program->config->scratch_bytes_per_wave && program->gfx_level == GFX9; 298bf215546Sopenharmony_ci 299bf215546Sopenharmony_ci if (program->gfx_level >= GFX10) { 300bf215546Sopenharmony_ci assert(!program->dev.xnack_enabled); 301bf215546Sopenharmony_ci return 0; 302bf215546Sopenharmony_ci } else if (program->gfx_level >= GFX8) { 303bf215546Sopenharmony_ci if (needs_flat_scr) 304bf215546Sopenharmony_ci return 6; 305bf215546Sopenharmony_ci else if (program->dev.xnack_enabled) 306bf215546Sopenharmony_ci return 4; 307bf215546Sopenharmony_ci else if (program->needs_vcc) 308bf215546Sopenharmony_ci return 2; 309bf215546Sopenharmony_ci else 310bf215546Sopenharmony_ci return 0; 311bf215546Sopenharmony_ci } else { 312bf215546Sopenharmony_ci assert(!program->dev.xnack_enabled); 313bf215546Sopenharmony_ci if (needs_flat_scr) 314bf215546Sopenharmony_ci return 4; 315bf215546Sopenharmony_ci else if (program->needs_vcc) 316bf215546Sopenharmony_ci return 2; 317bf215546Sopenharmony_ci else 318bf215546Sopenharmony_ci return 0; 319bf215546Sopenharmony_ci } 320bf215546Sopenharmony_ci} 321bf215546Sopenharmony_ci 322bf215546Sopenharmony_ciuint16_t 323bf215546Sopenharmony_ciget_sgpr_alloc(Program* program, uint16_t addressable_sgprs) 324bf215546Sopenharmony_ci{ 325bf215546Sopenharmony_ci uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program); 326bf215546Sopenharmony_ci uint16_t granule = program->dev.sgpr_alloc_granule; 327bf215546Sopenharmony_ci return ALIGN_NPOT(std::max(sgprs, granule), granule); 328bf215546Sopenharmony_ci} 329bf215546Sopenharmony_ci 330bf215546Sopenharmony_ciuint16_t 331bf215546Sopenharmony_ciget_vgpr_alloc(Program* program, uint16_t addressable_vgprs) 332bf215546Sopenharmony_ci{ 333bf215546Sopenharmony_ci assert(addressable_vgprs <= program->dev.vgpr_limit); 334bf215546Sopenharmony_ci uint16_t granule = program->dev.vgpr_alloc_granule; 335bf215546Sopenharmony_ci return align(std::max(addressable_vgprs, granule), granule); 336bf215546Sopenharmony_ci} 337bf215546Sopenharmony_ci 338bf215546Sopenharmony_ciunsigned 339bf215546Sopenharmony_ciround_down(unsigned a, unsigned b) 340bf215546Sopenharmony_ci{ 341bf215546Sopenharmony_ci return a - (a % b); 342bf215546Sopenharmony_ci} 343bf215546Sopenharmony_ci 344bf215546Sopenharmony_ciuint16_t 345bf215546Sopenharmony_ciget_addr_sgpr_from_waves(Program* program, uint16_t waves) 346bf215546Sopenharmony_ci{ 347bf215546Sopenharmony_ci /* it's not possible to allocate more than 128 SGPRs */ 348bf215546Sopenharmony_ci uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128); 349bf215546Sopenharmony_ci sgprs = round_down(sgprs, program->dev.sgpr_alloc_granule); 350bf215546Sopenharmony_ci sgprs -= get_extra_sgprs(program); 351bf215546Sopenharmony_ci return std::min(sgprs, program->dev.sgpr_limit); 352bf215546Sopenharmony_ci} 353bf215546Sopenharmony_ci 354bf215546Sopenharmony_ciuint16_t 355bf215546Sopenharmony_ciget_addr_vgpr_from_waves(Program* program, uint16_t waves) 356bf215546Sopenharmony_ci{ 357bf215546Sopenharmony_ci uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1); 358bf215546Sopenharmony_ci vgprs -= program->config->num_shared_vgprs / 2; 359bf215546Sopenharmony_ci return std::min(vgprs, program->dev.vgpr_limit); 360bf215546Sopenharmony_ci} 361bf215546Sopenharmony_ci 362bf215546Sopenharmony_civoid 363bf215546Sopenharmony_cicalc_min_waves(Program* program) 364bf215546Sopenharmony_ci{ 365bf215546Sopenharmony_ci unsigned waves_per_workgroup = calc_waves_per_workgroup(program); 366bf215546Sopenharmony_ci unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1); 367bf215546Sopenharmony_ci program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp); 368bf215546Sopenharmony_ci} 369bf215546Sopenharmony_ci 370bf215546Sopenharmony_ciuint16_t 371bf215546Sopenharmony_cimax_suitable_waves(Program* program, uint16_t waves) 372bf215546Sopenharmony_ci{ 373bf215546Sopenharmony_ci unsigned num_simd = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1); 374bf215546Sopenharmony_ci unsigned waves_per_workgroup = calc_waves_per_workgroup(program); 375bf215546Sopenharmony_ci unsigned num_workgroups = waves * num_simd / waves_per_workgroup; 376bf215546Sopenharmony_ci 377bf215546Sopenharmony_ci /* Adjust #workgroups for LDS */ 378bf215546Sopenharmony_ci unsigned lds_per_workgroup = align(program->config->lds_size * program->dev.lds_encoding_granule, 379bf215546Sopenharmony_ci program->dev.lds_alloc_granule); 380bf215546Sopenharmony_ci 381bf215546Sopenharmony_ci if (program->stage == fragment_fs) { 382bf215546Sopenharmony_ci /* PS inputs are moved from PC (parameter cache) to LDS before PS waves are launched. 383bf215546Sopenharmony_ci * Each PS input occupies 3x vec4 of LDS space. See Figure 10.3 in GCN3 ISA manual. 384bf215546Sopenharmony_ci * These limit occupancy the same way as other stages' LDS usage does. 385bf215546Sopenharmony_ci */ 386bf215546Sopenharmony_ci unsigned lds_bytes_per_interp = 3 * 16; 387bf215546Sopenharmony_ci unsigned lds_param_bytes = lds_bytes_per_interp * program->info.ps.num_interp; 388bf215546Sopenharmony_ci lds_per_workgroup += align(lds_param_bytes, program->dev.lds_alloc_granule); 389bf215546Sopenharmony_ci } 390bf215546Sopenharmony_ci unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit; 391bf215546Sopenharmony_ci if (lds_per_workgroup) 392bf215546Sopenharmony_ci num_workgroups = std::min(num_workgroups, lds_limit / lds_per_workgroup); 393bf215546Sopenharmony_ci 394bf215546Sopenharmony_ci /* Hardware limitation */ 395bf215546Sopenharmony_ci if (waves_per_workgroup > 1) 396bf215546Sopenharmony_ci num_workgroups = std::min(num_workgroups, program->wgp_mode ? 32u : 16u); 397bf215546Sopenharmony_ci 398bf215546Sopenharmony_ci /* Adjust #waves for workgroup multiples: 399bf215546Sopenharmony_ci * In cases like waves_per_workgroup=3 or lds=65536 and 400bf215546Sopenharmony_ci * waves_per_workgroup=1, we want the maximum possible number of waves per 401bf215546Sopenharmony_ci * SIMD and not the minimum. so DIV_ROUND_UP is used 402bf215546Sopenharmony_ci */ 403bf215546Sopenharmony_ci unsigned workgroup_waves = num_workgroups * waves_per_workgroup; 404bf215546Sopenharmony_ci return DIV_ROUND_UP(workgroup_waves, num_simd); 405bf215546Sopenharmony_ci} 406bf215546Sopenharmony_ci 407bf215546Sopenharmony_civoid 408bf215546Sopenharmony_ciupdate_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) 409bf215546Sopenharmony_ci{ 410bf215546Sopenharmony_ci assert(program->min_waves >= 1); 411bf215546Sopenharmony_ci uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); 412bf215546Sopenharmony_ci uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); 413bf215546Sopenharmony_ci 414bf215546Sopenharmony_ci /* this won't compile, register pressure reduction necessary */ 415bf215546Sopenharmony_ci if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) { 416bf215546Sopenharmony_ci program->num_waves = 0; 417bf215546Sopenharmony_ci program->max_reg_demand = new_demand; 418bf215546Sopenharmony_ci } else { 419bf215546Sopenharmony_ci program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); 420bf215546Sopenharmony_ci uint16_t vgpr_demand = 421bf215546Sopenharmony_ci get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2; 422bf215546Sopenharmony_ci program->num_waves = 423bf215546Sopenharmony_ci std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand); 424bf215546Sopenharmony_ci uint16_t max_waves = program->dev.max_wave64_per_simd * (64 / program->wave_size); 425bf215546Sopenharmony_ci program->num_waves = std::min(program->num_waves, max_waves); 426bf215546Sopenharmony_ci 427bf215546Sopenharmony_ci /* Adjust for LDS and workgroup multiples and calculate max_reg_demand */ 428bf215546Sopenharmony_ci program->num_waves = max_suitable_waves(program, program->num_waves); 429bf215546Sopenharmony_ci program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves); 430bf215546Sopenharmony_ci program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves); 431bf215546Sopenharmony_ci } 432bf215546Sopenharmony_ci} 433bf215546Sopenharmony_ci 434bf215546Sopenharmony_cilive 435bf215546Sopenharmony_cilive_var_analysis(Program* program) 436bf215546Sopenharmony_ci{ 437bf215546Sopenharmony_ci live result; 438bf215546Sopenharmony_ci result.live_out.resize(program->blocks.size()); 439bf215546Sopenharmony_ci result.register_demand.resize(program->blocks.size()); 440bf215546Sopenharmony_ci unsigned worklist = program->blocks.size(); 441bf215546Sopenharmony_ci std::vector<PhiInfo> phi_info(program->blocks.size()); 442bf215546Sopenharmony_ci RegisterDemand new_demand; 443bf215546Sopenharmony_ci 444bf215546Sopenharmony_ci program->needs_vcc = program->gfx_level >= GFX10; 445bf215546Sopenharmony_ci 446bf215546Sopenharmony_ci /* this implementation assumes that the block idx corresponds to the block's position in 447bf215546Sopenharmony_ci * program->blocks vector */ 448bf215546Sopenharmony_ci while (worklist) { 449bf215546Sopenharmony_ci unsigned block_idx = --worklist; 450bf215546Sopenharmony_ci process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, 451bf215546Sopenharmony_ci phi_info); 452bf215546Sopenharmony_ci new_demand.update(program->blocks[block_idx].register_demand); 453bf215546Sopenharmony_ci } 454bf215546Sopenharmony_ci 455bf215546Sopenharmony_ci /* Handle branches: we will insert copies created for linear phis just before the branch. */ 456bf215546Sopenharmony_ci for (Block& block : program->blocks) { 457bf215546Sopenharmony_ci result.register_demand[block.index].back().sgpr += phi_info[block.index].linear_phi_defs; 458bf215546Sopenharmony_ci result.register_demand[block.index].back().sgpr -= phi_info[block.index].linear_phi_ops; 459bf215546Sopenharmony_ci } 460bf215546Sopenharmony_ci 461bf215546Sopenharmony_ci /* calculate the program's register demand and number of waves */ 462bf215546Sopenharmony_ci if (program->progress < CompilationProgress::after_ra) 463bf215546Sopenharmony_ci update_vgpr_sgpr_demand(program, new_demand); 464bf215546Sopenharmony_ci 465bf215546Sopenharmony_ci return result; 466bf215546Sopenharmony_ci} 467bf215546Sopenharmony_ci 468bf215546Sopenharmony_ci} // namespace aco 469