1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2020 Valve Corporation 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci */ 24bf215546Sopenharmony_ci 25bf215546Sopenharmony_ci#include "aco_ir.h" 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci#include "util/crc32.h" 28bf215546Sopenharmony_ci 29bf215546Sopenharmony_ci#include <algorithm> 30bf215546Sopenharmony_ci#include <deque> 31bf215546Sopenharmony_ci#include <set> 32bf215546Sopenharmony_ci#include <vector> 33bf215546Sopenharmony_ci 34bf215546Sopenharmony_cinamespace aco { 35bf215546Sopenharmony_ci 36bf215546Sopenharmony_ci/* sgpr_presched/vgpr_presched */ 37bf215546Sopenharmony_civoid 38bf215546Sopenharmony_cicollect_presched_stats(Program* program) 39bf215546Sopenharmony_ci{ 40bf215546Sopenharmony_ci RegisterDemand presched_demand; 41bf215546Sopenharmony_ci for (Block& block : program->blocks) 42bf215546Sopenharmony_ci presched_demand.update(block.register_demand); 43bf215546Sopenharmony_ci program->statistics[statistic_sgpr_presched] = presched_demand.sgpr; 44bf215546Sopenharmony_ci program->statistics[statistic_vgpr_presched] = presched_demand.vgpr; 45bf215546Sopenharmony_ci} 46bf215546Sopenharmony_ci 47bf215546Sopenharmony_ciclass BlockCycleEstimator { 48bf215546Sopenharmony_cipublic: 49bf215546Sopenharmony_ci enum resource { 50bf215546Sopenharmony_ci null = 0, 51bf215546Sopenharmony_ci scalar, 52bf215546Sopenharmony_ci branch_sendmsg, 53bf215546Sopenharmony_ci valu, 54bf215546Sopenharmony_ci valu_complex, 55bf215546Sopenharmony_ci lds, 56bf215546Sopenharmony_ci export_gds, 57bf215546Sopenharmony_ci vmem, 58bf215546Sopenharmony_ci resource_count, 59bf215546Sopenharmony_ci }; 60bf215546Sopenharmony_ci 61bf215546Sopenharmony_ci BlockCycleEstimator(Program* program_) : program(program_) {} 62bf215546Sopenharmony_ci 63bf215546Sopenharmony_ci Program* program; 64bf215546Sopenharmony_ci 65bf215546Sopenharmony_ci int32_t cur_cycle = 0; 66bf215546Sopenharmony_ci int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0}; 67bf215546Sopenharmony_ci unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0}; 68bf215546Sopenharmony_ci int32_t reg_available[512] = {0}; 69bf215546Sopenharmony_ci std::deque<int32_t> lgkm; 70bf215546Sopenharmony_ci std::deque<int32_t> exp; 71bf215546Sopenharmony_ci std::deque<int32_t> vm; 72bf215546Sopenharmony_ci std::deque<int32_t> vs; 73bf215546Sopenharmony_ci 74bf215546Sopenharmony_ci unsigned predict_cost(aco_ptr<Instruction>& instr); 75bf215546Sopenharmony_ci void add(aco_ptr<Instruction>& instr); 76bf215546Sopenharmony_ci void join(const BlockCycleEstimator& other); 77bf215546Sopenharmony_ci 78bf215546Sopenharmony_ciprivate: 79bf215546Sopenharmony_ci unsigned get_waitcnt_cost(wait_imm imm); 80bf215546Sopenharmony_ci unsigned get_dependency_cost(aco_ptr<Instruction>& instr); 81bf215546Sopenharmony_ci 82bf215546Sopenharmony_ci void use_resources(aco_ptr<Instruction>& instr); 83bf215546Sopenharmony_ci int32_t cycles_until_res_available(aco_ptr<Instruction>& instr); 84bf215546Sopenharmony_ci}; 85bf215546Sopenharmony_ci 86bf215546Sopenharmony_cistruct wait_counter_info { 87bf215546Sopenharmony_ci wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) 88bf215546Sopenharmony_ci : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) 89bf215546Sopenharmony_ci {} 90bf215546Sopenharmony_ci 91bf215546Sopenharmony_ci unsigned vm; 92bf215546Sopenharmony_ci unsigned exp; 93bf215546Sopenharmony_ci unsigned lgkm; 94bf215546Sopenharmony_ci unsigned vs; 95bf215546Sopenharmony_ci}; 96bf215546Sopenharmony_ci 97bf215546Sopenharmony_cistruct perf_info { 98bf215546Sopenharmony_ci int latency; 99bf215546Sopenharmony_ci 100bf215546Sopenharmony_ci BlockCycleEstimator::resource rsrc0; 101bf215546Sopenharmony_ci unsigned cost0; 102bf215546Sopenharmony_ci 103bf215546Sopenharmony_ci BlockCycleEstimator::resource rsrc1; 104bf215546Sopenharmony_ci unsigned cost1; 105bf215546Sopenharmony_ci}; 106bf215546Sopenharmony_ci 107bf215546Sopenharmony_cistatic perf_info 108bf215546Sopenharmony_ciget_perf_info(Program* program, aco_ptr<Instruction>& instr) 109bf215546Sopenharmony_ci{ 110bf215546Sopenharmony_ci instr_class cls = instr_info.classes[(int)instr->opcode]; 111bf215546Sopenharmony_ci 112bf215546Sopenharmony_ci#define WAIT(res) BlockCycleEstimator::res, 0 113bf215546Sopenharmony_ci#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt 114bf215546Sopenharmony_ci 115bf215546Sopenharmony_ci if (program->gfx_level >= GFX10) { 116bf215546Sopenharmony_ci /* fp64 might be incorrect */ 117bf215546Sopenharmony_ci switch (cls) { 118bf215546Sopenharmony_ci case instr_class::valu32: 119bf215546Sopenharmony_ci case instr_class::valu_convert32: 120bf215546Sopenharmony_ci case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)}; 121bf215546Sopenharmony_ci case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)}; 122bf215546Sopenharmony_ci case instr_class::valu_quarter_rate32: 123bf215546Sopenharmony_ci return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)}; 124bf215546Sopenharmony_ci case instr_class::valu_transcendental32: 125bf215546Sopenharmony_ci return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)}; 126bf215546Sopenharmony_ci case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; 127bf215546Sopenharmony_ci case instr_class::valu_double_add: 128bf215546Sopenharmony_ci return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; 129bf215546Sopenharmony_ci case instr_class::valu_double_convert: 130bf215546Sopenharmony_ci return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; 131bf215546Sopenharmony_ci case instr_class::valu_double_transcendental: 132bf215546Sopenharmony_ci return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; 133bf215546Sopenharmony_ci case instr_class::salu: return {2, WAIT_USE(scalar, 1)}; 134bf215546Sopenharmony_ci case instr_class::smem: return {0, WAIT_USE(scalar, 1)}; 135bf215546Sopenharmony_ci case instr_class::branch: 136bf215546Sopenharmony_ci case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)}; 137bf215546Sopenharmony_ci case instr_class::ds: 138bf215546Sopenharmony_ci return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)} 139bf215546Sopenharmony_ci : perf_info{0, WAIT_USE(lds, 1)}; 140bf215546Sopenharmony_ci case instr_class::exp: return {0, WAIT_USE(export_gds, 1)}; 141bf215546Sopenharmony_ci case instr_class::vmem: return {0, WAIT_USE(vmem, 1)}; 142bf215546Sopenharmony_ci case instr_class::barrier: 143bf215546Sopenharmony_ci case instr_class::waitcnt: 144bf215546Sopenharmony_ci case instr_class::other: 145bf215546Sopenharmony_ci default: return {0}; 146bf215546Sopenharmony_ci } 147bf215546Sopenharmony_ci } else { 148bf215546Sopenharmony_ci switch (cls) { 149bf215546Sopenharmony_ci case instr_class::valu32: return {4, WAIT_USE(valu, 4)}; 150bf215546Sopenharmony_ci case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)}; 151bf215546Sopenharmony_ci case instr_class::valu64: return {8, WAIT_USE(valu, 8)}; 152bf215546Sopenharmony_ci case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)}; 153bf215546Sopenharmony_ci case instr_class::valu_fma: 154bf215546Sopenharmony_ci return program->dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)} 155bf215546Sopenharmony_ci : perf_info{16, WAIT_USE(valu, 16)}; 156bf215546Sopenharmony_ci case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)}; 157bf215546Sopenharmony_ci case instr_class::valu_double: return {64, WAIT_USE(valu, 64)}; 158bf215546Sopenharmony_ci case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)}; 159bf215546Sopenharmony_ci case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)}; 160bf215546Sopenharmony_ci case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)}; 161bf215546Sopenharmony_ci case instr_class::salu: return {4, WAIT_USE(scalar, 4)}; 162bf215546Sopenharmony_ci case instr_class::smem: return {4, WAIT_USE(scalar, 4)}; 163bf215546Sopenharmony_ci case instr_class::branch: 164bf215546Sopenharmony_ci return {8, WAIT_USE(branch_sendmsg, 8)}; 165bf215546Sopenharmony_ci return {4, WAIT_USE(branch_sendmsg, 4)}; 166bf215546Sopenharmony_ci case instr_class::ds: 167bf215546Sopenharmony_ci return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)} 168bf215546Sopenharmony_ci : perf_info{4, WAIT_USE(lds, 4)}; 169bf215546Sopenharmony_ci case instr_class::exp: return {16, WAIT_USE(export_gds, 16)}; 170bf215546Sopenharmony_ci case instr_class::vmem: return {4, WAIT_USE(vmem, 4)}; 171bf215546Sopenharmony_ci case instr_class::barrier: 172bf215546Sopenharmony_ci case instr_class::waitcnt: 173bf215546Sopenharmony_ci case instr_class::other: 174bf215546Sopenharmony_ci default: return {4}; 175bf215546Sopenharmony_ci } 176bf215546Sopenharmony_ci } 177bf215546Sopenharmony_ci 178bf215546Sopenharmony_ci#undef WAIT_USE 179bf215546Sopenharmony_ci#undef WAIT 180bf215546Sopenharmony_ci} 181bf215546Sopenharmony_ci 182bf215546Sopenharmony_civoid 183bf215546Sopenharmony_ciBlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr) 184bf215546Sopenharmony_ci{ 185bf215546Sopenharmony_ci perf_info perf = get_perf_info(program, instr); 186bf215546Sopenharmony_ci 187bf215546Sopenharmony_ci if (perf.rsrc0 != resource_count) { 188bf215546Sopenharmony_ci res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0; 189bf215546Sopenharmony_ci res_usage[(int)perf.rsrc0] += perf.cost0; 190bf215546Sopenharmony_ci } 191bf215546Sopenharmony_ci 192bf215546Sopenharmony_ci if (perf.rsrc1 != resource_count) { 193bf215546Sopenharmony_ci res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1; 194bf215546Sopenharmony_ci res_usage[(int)perf.rsrc1] += perf.cost1; 195bf215546Sopenharmony_ci } 196bf215546Sopenharmony_ci} 197bf215546Sopenharmony_ci 198bf215546Sopenharmony_ciint32_t 199bf215546Sopenharmony_ciBlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr) 200bf215546Sopenharmony_ci{ 201bf215546Sopenharmony_ci perf_info perf = get_perf_info(program, instr); 202bf215546Sopenharmony_ci 203bf215546Sopenharmony_ci int32_t cost = 0; 204bf215546Sopenharmony_ci if (perf.rsrc0 != resource_count) 205bf215546Sopenharmony_ci cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle); 206bf215546Sopenharmony_ci if (perf.rsrc1 != resource_count) 207bf215546Sopenharmony_ci cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle); 208bf215546Sopenharmony_ci 209bf215546Sopenharmony_ci return cost; 210bf215546Sopenharmony_ci} 211bf215546Sopenharmony_ci 212bf215546Sopenharmony_cistatic wait_counter_info 213bf215546Sopenharmony_ciget_wait_counter_info(aco_ptr<Instruction>& instr) 214bf215546Sopenharmony_ci{ 215bf215546Sopenharmony_ci /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance 216bf215546Sopenharmony_ci * depends a lot on the situation. */ 217bf215546Sopenharmony_ci 218bf215546Sopenharmony_ci if (instr->isEXP()) 219bf215546Sopenharmony_ci return wait_counter_info(0, 16, 0, 0); 220bf215546Sopenharmony_ci 221bf215546Sopenharmony_ci if (instr->isFlatLike()) { 222bf215546Sopenharmony_ci unsigned lgkm = instr->isFlat() ? 20 : 0; 223bf215546Sopenharmony_ci if (!instr->definitions.empty()) 224bf215546Sopenharmony_ci return wait_counter_info(320, 0, lgkm, 0); 225bf215546Sopenharmony_ci else 226bf215546Sopenharmony_ci return wait_counter_info(0, 0, lgkm, 320); 227bf215546Sopenharmony_ci } 228bf215546Sopenharmony_ci 229bf215546Sopenharmony_ci if (instr->isSMEM()) { 230bf215546Sopenharmony_ci if (instr->definitions.empty()) 231bf215546Sopenharmony_ci return wait_counter_info(0, 0, 200, 0); 232bf215546Sopenharmony_ci if (instr->operands.empty()) /* s_memtime and s_memrealtime */ 233bf215546Sopenharmony_ci return wait_counter_info(0, 0, 1, 0); 234bf215546Sopenharmony_ci 235bf215546Sopenharmony_ci bool likely_desc_load = instr->operands[0].size() == 2; 236bf215546Sopenharmony_ci bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4); 237bf215546Sopenharmony_ci bool const_offset = 238bf215546Sopenharmony_ci instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant()); 239bf215546Sopenharmony_ci 240bf215546Sopenharmony_ci if (likely_desc_load || const_offset) 241bf215546Sopenharmony_ci return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */ 242bf215546Sopenharmony_ci 243bf215546Sopenharmony_ci return wait_counter_info(0, 0, 200, 0); 244bf215546Sopenharmony_ci } 245bf215546Sopenharmony_ci 246bf215546Sopenharmony_ci if (instr->format == Format::DS) 247bf215546Sopenharmony_ci return wait_counter_info(0, 0, 20, 0); 248bf215546Sopenharmony_ci 249bf215546Sopenharmony_ci if (instr->isVMEM() && !instr->definitions.empty()) 250bf215546Sopenharmony_ci return wait_counter_info(320, 0, 0, 0); 251bf215546Sopenharmony_ci 252bf215546Sopenharmony_ci if (instr->isVMEM() && instr->definitions.empty()) 253bf215546Sopenharmony_ci return wait_counter_info(0, 0, 0, 320); 254bf215546Sopenharmony_ci 255bf215546Sopenharmony_ci return wait_counter_info(0, 0, 0, 0); 256bf215546Sopenharmony_ci} 257bf215546Sopenharmony_ci 258bf215546Sopenharmony_cistatic wait_imm 259bf215546Sopenharmony_ciget_wait_imm(Program* program, aco_ptr<Instruction>& instr) 260bf215546Sopenharmony_ci{ 261bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_endpgm) { 262bf215546Sopenharmony_ci return wait_imm(0, 0, 0, 0); 263bf215546Sopenharmony_ci } else if (instr->opcode == aco_opcode::s_waitcnt) { 264bf215546Sopenharmony_ci return wait_imm(GFX10_3, instr->sopp().imm); 265bf215546Sopenharmony_ci } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) { 266bf215546Sopenharmony_ci return wait_imm(0, 0, 0, instr->sopk().imm); 267bf215546Sopenharmony_ci } else { 268bf215546Sopenharmony_ci unsigned max_lgkm_cnt = program->gfx_level >= GFX10 ? 62 : 14; 269bf215546Sopenharmony_ci unsigned max_exp_cnt = 6; 270bf215546Sopenharmony_ci unsigned max_vm_cnt = program->gfx_level >= GFX9 ? 62 : 14; 271bf215546Sopenharmony_ci unsigned max_vs_cnt = 62; 272bf215546Sopenharmony_ci 273bf215546Sopenharmony_ci wait_counter_info wait_info = get_wait_counter_info(instr); 274bf215546Sopenharmony_ci wait_imm imm; 275bf215546Sopenharmony_ci imm.lgkm = wait_info.lgkm ? max_lgkm_cnt : wait_imm::unset_counter; 276bf215546Sopenharmony_ci imm.exp = wait_info.exp ? max_exp_cnt : wait_imm::unset_counter; 277bf215546Sopenharmony_ci imm.vm = wait_info.vm ? max_vm_cnt : wait_imm::unset_counter; 278bf215546Sopenharmony_ci imm.vs = wait_info.vs ? max_vs_cnt : wait_imm::unset_counter; 279bf215546Sopenharmony_ci return imm; 280bf215546Sopenharmony_ci } 281bf215546Sopenharmony_ci} 282bf215546Sopenharmony_ci 283bf215546Sopenharmony_ciunsigned 284bf215546Sopenharmony_ciBlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr) 285bf215546Sopenharmony_ci{ 286bf215546Sopenharmony_ci int deps_available = cur_cycle; 287bf215546Sopenharmony_ci 288bf215546Sopenharmony_ci wait_imm imm = get_wait_imm(program, instr); 289bf215546Sopenharmony_ci if (imm.vm != wait_imm::unset_counter) { 290bf215546Sopenharmony_ci for (int i = 0; i < (int)vm.size() - imm.vm; i++) 291bf215546Sopenharmony_ci deps_available = MAX2(deps_available, vm[i]); 292bf215546Sopenharmony_ci } 293bf215546Sopenharmony_ci if (imm.exp != wait_imm::unset_counter) { 294bf215546Sopenharmony_ci for (int i = 0; i < (int)exp.size() - imm.exp; i++) 295bf215546Sopenharmony_ci deps_available = MAX2(deps_available, exp[i]); 296bf215546Sopenharmony_ci } 297bf215546Sopenharmony_ci if (imm.lgkm != wait_imm::unset_counter) { 298bf215546Sopenharmony_ci for (int i = 0; i < (int)lgkm.size() - imm.lgkm; i++) 299bf215546Sopenharmony_ci deps_available = MAX2(deps_available, lgkm[i]); 300bf215546Sopenharmony_ci } 301bf215546Sopenharmony_ci if (imm.vs != wait_imm::unset_counter) { 302bf215546Sopenharmony_ci for (int i = 0; i < (int)vs.size() - imm.vs; i++) 303bf215546Sopenharmony_ci deps_available = MAX2(deps_available, vs[i]); 304bf215546Sopenharmony_ci } 305bf215546Sopenharmony_ci 306bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_endpgm) { 307bf215546Sopenharmony_ci for (unsigned i = 0; i < 512; i++) 308bf215546Sopenharmony_ci deps_available = MAX2(deps_available, reg_available[i]); 309bf215546Sopenharmony_ci } else if (program->gfx_level >= GFX10) { 310bf215546Sopenharmony_ci for (Operand& op : instr->operands) { 311bf215546Sopenharmony_ci if (op.isConstant() || op.isUndefined()) 312bf215546Sopenharmony_ci continue; 313bf215546Sopenharmony_ci for (unsigned i = 0; i < op.size(); i++) 314bf215546Sopenharmony_ci deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]); 315bf215546Sopenharmony_ci } 316bf215546Sopenharmony_ci } 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci if (program->gfx_level < GFX10) 319bf215546Sopenharmony_ci deps_available = align(deps_available, 4); 320bf215546Sopenharmony_ci 321bf215546Sopenharmony_ci return deps_available - cur_cycle; 322bf215546Sopenharmony_ci} 323bf215546Sopenharmony_ci 324bf215546Sopenharmony_ciunsigned 325bf215546Sopenharmony_ciBlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr) 326bf215546Sopenharmony_ci{ 327bf215546Sopenharmony_ci int32_t dep = get_dependency_cost(instr); 328bf215546Sopenharmony_ci return dep + std::max(cycles_until_res_available(instr) - dep, 0); 329bf215546Sopenharmony_ci} 330bf215546Sopenharmony_ci 331bf215546Sopenharmony_cistatic bool 332bf215546Sopenharmony_ciis_vector(aco_opcode op) 333bf215546Sopenharmony_ci{ 334bf215546Sopenharmony_ci switch (instr_info.classes[(int)op]) { 335bf215546Sopenharmony_ci case instr_class::valu32: 336bf215546Sopenharmony_ci case instr_class::valu_convert32: 337bf215546Sopenharmony_ci case instr_class::valu_fma: 338bf215546Sopenharmony_ci case instr_class::valu_double: 339bf215546Sopenharmony_ci case instr_class::valu_double_add: 340bf215546Sopenharmony_ci case instr_class::valu_double_convert: 341bf215546Sopenharmony_ci case instr_class::valu_double_transcendental: 342bf215546Sopenharmony_ci case instr_class::vmem: 343bf215546Sopenharmony_ci case instr_class::ds: 344bf215546Sopenharmony_ci case instr_class::exp: 345bf215546Sopenharmony_ci case instr_class::valu64: 346bf215546Sopenharmony_ci case instr_class::valu_quarter_rate32: 347bf215546Sopenharmony_ci case instr_class::valu_transcendental32: return true; 348bf215546Sopenharmony_ci default: return false; 349bf215546Sopenharmony_ci } 350bf215546Sopenharmony_ci} 351bf215546Sopenharmony_ci 352bf215546Sopenharmony_civoid 353bf215546Sopenharmony_ciBlockCycleEstimator::add(aco_ptr<Instruction>& instr) 354bf215546Sopenharmony_ci{ 355bf215546Sopenharmony_ci perf_info perf = get_perf_info(program, instr); 356bf215546Sopenharmony_ci 357bf215546Sopenharmony_ci cur_cycle += get_dependency_cost(instr); 358bf215546Sopenharmony_ci 359bf215546Sopenharmony_ci unsigned start; 360bf215546Sopenharmony_ci bool dual_issue = program->gfx_level >= GFX10 && program->wave_size == 64 && 361bf215546Sopenharmony_ci is_vector(instr->opcode) && program->workgroup_size > 32; 362bf215546Sopenharmony_ci for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) { 363bf215546Sopenharmony_ci cur_cycle += cycles_until_res_available(instr); 364bf215546Sopenharmony_ci 365bf215546Sopenharmony_ci start = cur_cycle; 366bf215546Sopenharmony_ci use_resources(instr); 367bf215546Sopenharmony_ci 368bf215546Sopenharmony_ci /* GCN is in-order and doesn't begin the next instruction until the current one finishes */ 369bf215546Sopenharmony_ci cur_cycle += program->gfx_level >= GFX10 ? 1 : perf.latency; 370bf215546Sopenharmony_ci } 371bf215546Sopenharmony_ci 372bf215546Sopenharmony_ci wait_imm imm = get_wait_imm(program, instr); 373bf215546Sopenharmony_ci while (lgkm.size() > imm.lgkm) 374bf215546Sopenharmony_ci lgkm.pop_front(); 375bf215546Sopenharmony_ci while (exp.size() > imm.exp) 376bf215546Sopenharmony_ci exp.pop_front(); 377bf215546Sopenharmony_ci while (vm.size() > imm.vm) 378bf215546Sopenharmony_ci vm.pop_front(); 379bf215546Sopenharmony_ci while (vs.size() > imm.vs) 380bf215546Sopenharmony_ci vs.pop_front(); 381bf215546Sopenharmony_ci 382bf215546Sopenharmony_ci wait_counter_info wait_info = get_wait_counter_info(instr); 383bf215546Sopenharmony_ci if (wait_info.exp) 384bf215546Sopenharmony_ci exp.push_back(cur_cycle + wait_info.exp); 385bf215546Sopenharmony_ci if (wait_info.lgkm) 386bf215546Sopenharmony_ci lgkm.push_back(cur_cycle + wait_info.lgkm); 387bf215546Sopenharmony_ci if (wait_info.vm) 388bf215546Sopenharmony_ci vm.push_back(cur_cycle + wait_info.vm); 389bf215546Sopenharmony_ci if (wait_info.vs) 390bf215546Sopenharmony_ci vs.push_back(cur_cycle + wait_info.vs); 391bf215546Sopenharmony_ci 392bf215546Sopenharmony_ci /* This is inaccurate but shouldn't affect anything after waitcnt insertion. 393bf215546Sopenharmony_ci * Before waitcnt insertion, this is necessary to consider memory operations. 394bf215546Sopenharmony_ci */ 395bf215546Sopenharmony_ci int latency = MAX3(wait_info.exp, wait_info.lgkm, wait_info.vm); 396bf215546Sopenharmony_ci int32_t result_available = start + MAX2(perf.latency, latency); 397bf215546Sopenharmony_ci 398bf215546Sopenharmony_ci for (Definition& def : instr->definitions) { 399bf215546Sopenharmony_ci int32_t* available = ®_available[def.physReg().reg()]; 400bf215546Sopenharmony_ci for (unsigned i = 0; i < def.size(); i++) 401bf215546Sopenharmony_ci available[i] = MAX2(available[i], result_available); 402bf215546Sopenharmony_ci } 403bf215546Sopenharmony_ci} 404bf215546Sopenharmony_ci 405bf215546Sopenharmony_cistatic void 406bf215546Sopenharmony_cijoin_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff) 407bf215546Sopenharmony_ci{ 408bf215546Sopenharmony_ci for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++) 409bf215546Sopenharmony_ci queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff); 410bf215546Sopenharmony_ci for (int i = pred.size() - queue.size() - 1; i >= 0; i--) 411bf215546Sopenharmony_ci queue.push_front(pred[i] + cycle_diff); 412bf215546Sopenharmony_ci} 413bf215546Sopenharmony_ci 414bf215546Sopenharmony_civoid 415bf215546Sopenharmony_ciBlockCycleEstimator::join(const BlockCycleEstimator& pred) 416bf215546Sopenharmony_ci{ 417bf215546Sopenharmony_ci assert(cur_cycle == 0); 418bf215546Sopenharmony_ci 419bf215546Sopenharmony_ci for (unsigned i = 0; i < (unsigned)resource_count; i++) { 420bf215546Sopenharmony_ci assert(res_usage[i] == 0); 421bf215546Sopenharmony_ci res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle); 422bf215546Sopenharmony_ci } 423bf215546Sopenharmony_ci 424bf215546Sopenharmony_ci for (unsigned i = 0; i < 512; i++) 425bf215546Sopenharmony_ci reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle); 426bf215546Sopenharmony_ci 427bf215546Sopenharmony_ci join_queue(lgkm, pred.lgkm, -pred.cur_cycle); 428bf215546Sopenharmony_ci join_queue(exp, pred.exp, -pred.cur_cycle); 429bf215546Sopenharmony_ci join_queue(vm, pred.vm, -pred.cur_cycle); 430bf215546Sopenharmony_ci join_queue(vs, pred.vs, -pred.cur_cycle); 431bf215546Sopenharmony_ci} 432bf215546Sopenharmony_ci 433bf215546Sopenharmony_ci/* instructions/branches/vmem_clauses/smem_clauses/cycles */ 434bf215546Sopenharmony_civoid 435bf215546Sopenharmony_cicollect_preasm_stats(Program* program) 436bf215546Sopenharmony_ci{ 437bf215546Sopenharmony_ci for (Block& block : program->blocks) { 438bf215546Sopenharmony_ci std::set<Instruction*> vmem_clause; 439bf215546Sopenharmony_ci std::set<Instruction*> smem_clause; 440bf215546Sopenharmony_ci 441bf215546Sopenharmony_ci program->statistics[statistic_instructions] += block.instructions.size(); 442bf215546Sopenharmony_ci 443bf215546Sopenharmony_ci for (aco_ptr<Instruction>& instr : block.instructions) { 444bf215546Sopenharmony_ci if (instr->isSOPP() && instr->sopp().block != -1) 445bf215546Sopenharmony_ci program->statistics[statistic_branches]++; 446bf215546Sopenharmony_ci 447bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::p_constaddr) 448bf215546Sopenharmony_ci program->statistics[statistic_instructions] += 2; 449bf215546Sopenharmony_ci 450bf215546Sopenharmony_ci if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) && 451bf215546Sopenharmony_ci !instr->operands.empty()) { 452bf215546Sopenharmony_ci if (std::none_of(vmem_clause.begin(), vmem_clause.end(), 453bf215546Sopenharmony_ci [&](Instruction* other) 454bf215546Sopenharmony_ci { return should_form_clause(instr.get(), other); })) 455bf215546Sopenharmony_ci program->statistics[statistic_vmem_clauses]++; 456bf215546Sopenharmony_ci vmem_clause.insert(instr.get()); 457bf215546Sopenharmony_ci } else { 458bf215546Sopenharmony_ci vmem_clause.clear(); 459bf215546Sopenharmony_ci } 460bf215546Sopenharmony_ci 461bf215546Sopenharmony_ci if (instr->isSMEM() && !instr->operands.empty()) { 462bf215546Sopenharmony_ci if (std::none_of(smem_clause.begin(), smem_clause.end(), 463bf215546Sopenharmony_ci [&](Instruction* other) 464bf215546Sopenharmony_ci { return should_form_clause(instr.get(), other); })) 465bf215546Sopenharmony_ci program->statistics[statistic_smem_clauses]++; 466bf215546Sopenharmony_ci smem_clause.insert(instr.get()); 467bf215546Sopenharmony_ci } else { 468bf215546Sopenharmony_ci smem_clause.clear(); 469bf215546Sopenharmony_ci } 470bf215546Sopenharmony_ci } 471bf215546Sopenharmony_ci } 472bf215546Sopenharmony_ci 473bf215546Sopenharmony_ci double latency = 0; 474bf215546Sopenharmony_ci double usage[(int)BlockCycleEstimator::resource_count] = {0}; 475bf215546Sopenharmony_ci std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program); 476bf215546Sopenharmony_ci 477bf215546Sopenharmony_ci if (program->stage.has(SWStage::VS) && program->info.vs.has_prolog) { 478bf215546Sopenharmony_ci unsigned vs_input_latency = 320; 479bf215546Sopenharmony_ci for (Definition def : program->vs_inputs) { 480bf215546Sopenharmony_ci blocks[0].vm.push_back(vs_input_latency); 481bf215546Sopenharmony_ci for (unsigned i = 0; i < def.size(); i++) 482bf215546Sopenharmony_ci blocks[0].reg_available[def.physReg().reg() + i] = vs_input_latency; 483bf215546Sopenharmony_ci } 484bf215546Sopenharmony_ci } 485bf215546Sopenharmony_ci 486bf215546Sopenharmony_ci for (Block& block : program->blocks) { 487bf215546Sopenharmony_ci BlockCycleEstimator& block_est = blocks[block.index]; 488bf215546Sopenharmony_ci for (unsigned pred : block.linear_preds) 489bf215546Sopenharmony_ci block_est.join(blocks[pred]); 490bf215546Sopenharmony_ci 491bf215546Sopenharmony_ci for (aco_ptr<Instruction>& instr : block.instructions) { 492bf215546Sopenharmony_ci unsigned before = block_est.cur_cycle; 493bf215546Sopenharmony_ci block_est.add(instr); 494bf215546Sopenharmony_ci instr->pass_flags = block_est.cur_cycle - before; 495bf215546Sopenharmony_ci } 496bf215546Sopenharmony_ci 497bf215546Sopenharmony_ci /* TODO: it would be nice to be able to consider estimated loop trip 498bf215546Sopenharmony_ci * counts used for loop unrolling. 499bf215546Sopenharmony_ci */ 500bf215546Sopenharmony_ci 501bf215546Sopenharmony_ci /* TODO: estimate the trip_count of divergent loops (those which break 502bf215546Sopenharmony_ci * divergent) higher than of uniform loops 503bf215546Sopenharmony_ci */ 504bf215546Sopenharmony_ci 505bf215546Sopenharmony_ci /* Assume loops execute 8-2 times, uniform branches are taken 50% the time, 506bf215546Sopenharmony_ci * and any lane in the wave takes a side of a divergent branch 75% of the 507bf215546Sopenharmony_ci * time. 508bf215546Sopenharmony_ci */ 509bf215546Sopenharmony_ci double iter = 1.0; 510bf215546Sopenharmony_ci iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0; 511bf215546Sopenharmony_ci iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0; 512bf215546Sopenharmony_ci iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0; 513bf215546Sopenharmony_ci iter *= pow(0.5, block.uniform_if_depth); 514bf215546Sopenharmony_ci iter *= pow(0.75, block.divergent_if_logical_depth); 515bf215546Sopenharmony_ci 516bf215546Sopenharmony_ci bool divergent_if_linear_else = 517bf215546Sopenharmony_ci block.logical_preds.empty() && block.linear_preds.size() == 1 && 518bf215546Sopenharmony_ci block.linear_succs.size() == 1 && 519bf215546Sopenharmony_ci program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert); 520bf215546Sopenharmony_ci if (divergent_if_linear_else) 521bf215546Sopenharmony_ci iter *= 0.25; 522bf215546Sopenharmony_ci 523bf215546Sopenharmony_ci latency += block_est.cur_cycle * iter; 524bf215546Sopenharmony_ci for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) 525bf215546Sopenharmony_ci usage[i] += block_est.res_usage[i] * iter; 526bf215546Sopenharmony_ci } 527bf215546Sopenharmony_ci 528bf215546Sopenharmony_ci /* This likely exaggerates the effectiveness of parallelism because it 529bf215546Sopenharmony_ci * ignores instruction ordering. It can assume there might be SALU/VALU/etc 530bf215546Sopenharmony_ci * work to from other waves while one is idle but that might not be the case 531bf215546Sopenharmony_ci * because those other waves have not reached such a point yet. 532bf215546Sopenharmony_ci */ 533bf215546Sopenharmony_ci 534bf215546Sopenharmony_ci double parallelism = program->num_waves; 535bf215546Sopenharmony_ci for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) { 536bf215546Sopenharmony_ci if (usage[i] > 0.0) 537bf215546Sopenharmony_ci parallelism = MIN2(parallelism, latency / usage[i]); 538bf215546Sopenharmony_ci } 539bf215546Sopenharmony_ci double waves_per_cycle = 1.0 / latency * parallelism; 540bf215546Sopenharmony_ci double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0); 541bf215546Sopenharmony_ci 542bf215546Sopenharmony_ci double max_utilization = 1.0; 543bf215546Sopenharmony_ci if (program->workgroup_size != UINT_MAX) 544bf215546Sopenharmony_ci max_utilization = 545bf215546Sopenharmony_ci program->workgroup_size / (double)align(program->workgroup_size, program->wave_size); 546bf215546Sopenharmony_ci wave64_per_cycle *= max_utilization; 547bf215546Sopenharmony_ci 548bf215546Sopenharmony_ci program->statistics[statistic_latency] = round(latency); 549bf215546Sopenharmony_ci program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle); 550bf215546Sopenharmony_ci 551bf215546Sopenharmony_ci if (debug_flags & DEBUG_PERF_INFO) { 552bf215546Sopenharmony_ci aco_print_program(program, stderr, print_no_ssa | print_perf_info); 553bf215546Sopenharmony_ci 554bf215546Sopenharmony_ci fprintf(stderr, "num_waves: %u\n", program->num_waves); 555bf215546Sopenharmony_ci fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]); 556bf215546Sopenharmony_ci fprintf(stderr, "branch_sendmsg_usage: %f\n", 557bf215546Sopenharmony_ci usage[(int)BlockCycleEstimator::branch_sendmsg]); 558bf215546Sopenharmony_ci fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]); 559bf215546Sopenharmony_ci fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]); 560bf215546Sopenharmony_ci fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]); 561bf215546Sopenharmony_ci fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]); 562bf215546Sopenharmony_ci fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]); 563bf215546Sopenharmony_ci fprintf(stderr, "latency: %f\n", latency); 564bf215546Sopenharmony_ci fprintf(stderr, "parallelism: %f\n", parallelism); 565bf215546Sopenharmony_ci fprintf(stderr, "max_utilization: %f\n", max_utilization); 566bf215546Sopenharmony_ci fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle); 567bf215546Sopenharmony_ci fprintf(stderr, "\n"); 568bf215546Sopenharmony_ci } 569bf215546Sopenharmony_ci} 570bf215546Sopenharmony_ci 571bf215546Sopenharmony_civoid 572bf215546Sopenharmony_cicollect_postasm_stats(Program* program, const std::vector<uint32_t>& code) 573bf215546Sopenharmony_ci{ 574bf215546Sopenharmony_ci program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4); 575bf215546Sopenharmony_ci} 576bf215546Sopenharmony_ci 577bf215546Sopenharmony_ci} // namespace aco 578