/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"

#include "common/sid.h"

#include <map>
#include <stack>
#include <vector>

namespace aco {

namespace {

/**
 * The general idea of this pass is:
 * The CFG is traversed in reverse postorder (forward) and loops are processed
 * several times until no progress is made.
 * Per BB, two wait_ctx are maintained: an in-context and an out-context.
 * The in-context is the joined out-contexts of the predecessors.
 * The context contains a map: gpr -> wait_entry
 * with the information about the cnt values to be waited for.
 * Note: after merge nodes, it can happen that multiple cnt values have to be
 * waited for on the same register.
 *
 * The values are updated according to the encountered instructions:
 * - additional events increment the counters of entries with events of the same type
 * - waits remove the counters of gprs whose values are at least as high as the
 *   value being waited for (those accesses are then known to have completed).
 */
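
/* Illustrative example (a sketch, not taken from a real shader): for a
 * sequence like
 *    buffer_load_dword v0, ...    ; vm_cnt++, gpr_map[v0] = {vm=0}
 *    s_load_dword s0, ...         ; lgkm_cnt++ (smem is an unordered event)
 *    v_add_f32 v1, v0, v0         ; reads v0
 * the read of v0 makes the pass emit "s_waitcnt vmcnt(0)" right before the
 * v_add_f32, while lgkm_cnt is left alone because s0 is not accessed yet.
 */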

// TODO: do a more clever insertion of wait_cnt (lgkm_cnt)
// when there is a load followed by a use of a previous load

/* Instructions of the same event will finish in-order except for smem
 * and maybe flat. Instructions of different events may not finish in-order.
 */
enum wait_event : uint16_t {
   event_smem = 1 << 0,
   event_lds = 1 << 1,
   event_gds = 1 << 2,
   event_vmem = 1 << 3,
   event_vmem_store = 1 << 4, /* GFX10+ */
   event_flat = 1 << 5,
   event_exp_pos = 1 << 6,
   event_exp_param = 1 << 7,
   event_exp_mrt_null = 1 << 8,
   event_gds_gpr_lock = 1 << 9,
   event_vmem_gpr_lock = 1 << 10,
   event_sendmsg = 1 << 11,
   num_events = 12,
};

enum counter_type : uint8_t {
   counter_exp = 1 << 0,
   counter_lgkm = 1 << 1,
   counter_vm = 1 << 2,
   counter_vs = 1 << 3,
   num_counters = 4,
};

enum vmem_type : uint8_t {
   vmem_nosampler = 1 << 0,
   vmem_sampler = 1 << 1,
   vmem_bvh = 1 << 2,
};

static const uint16_t exp_events =
   event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
static const uint16_t vm_events = event_vmem | event_flat;
static const uint16_t vs_events = event_vmem_store;

uint8_t
get_counters_for_event(wait_event ev)
{
   switch (ev) {
   case event_smem:
   case event_lds:
   case event_gds:
   case event_sendmsg: return counter_lgkm;
   case event_vmem: return counter_vm;
   case event_vmem_store: return counter_vs;
   case event_flat: return counter_vm | counter_lgkm;
   case event_exp_pos:
   case event_exp_param:
   case event_exp_mrt_null:
   case event_gds_gpr_lock:
   case event_vmem_gpr_lock: return counter_exp;
   default: return 0;
   }
}

struct wait_entry {
   wait_imm imm;
   uint16_t events;  /* use wait_event notion */
   uint8_t counters; /* use counter_type notion */
   bool wait_on_read : 1;
   bool logical : 1;
   uint8_t vmem_types : 4;

   wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_)
       : imm(imm_), events(event_), counters(get_counters_for_event(event_)),
         wait_on_read(wait_on_read_), logical(logical_), vmem_types(0)
   {}

   bool join(const wait_entry& other)
   {
      /* use bitwise-not here: we want to detect new bits in other.vmem_types */
      bool changed = (other.events & ~events) || (other.counters & ~counters) ||
                     (other.wait_on_read && !wait_on_read) || (other.vmem_types & ~vmem_types);
      events |= other.events;
      counters |= other.counters;
      changed |= imm.combine(other.imm);
      wait_on_read |= other.wait_on_read;
      vmem_types |= other.vmem_types;
      assert(logical == other.logical);
      return changed;
   }

   void remove_counter(counter_type counter)
   {
      counters &= ~counter;

      if (counter == counter_lgkm) {
         imm.lgkm = wait_imm::unset_counter;
         events &= ~(event_smem | event_lds | event_gds | event_sendmsg);
      }

      if (counter == counter_vm) {
         imm.vm = wait_imm::unset_counter;
         events &= ~event_vmem;
         vmem_types = 0;
      }

      if (counter == counter_exp) {
         imm.exp = wait_imm::unset_counter;
         events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock |
                     event_vmem_gpr_lock);
      }

      if (counter == counter_vs) {
         imm.vs = wait_imm::unset_counter;
         events &= ~event_vmem_store;
      }

      if (!(counters & counter_lgkm) && !(counters & counter_vm))
         events &= ~event_flat;
   }
};

struct wait_ctx {
   Program* program;
   enum amd_gfx_level gfx_level;
   uint16_t max_vm_cnt;
   uint16_t max_exp_cnt;
   uint16_t max_lgkm_cnt;
   uint16_t max_vs_cnt;
   uint16_t unordered_events = event_smem | event_flat;

   uint8_t vm_cnt = 0;
   uint8_t exp_cnt = 0;
   uint8_t lgkm_cnt = 0;
   uint8_t vs_cnt = 0;
   bool pending_flat_lgkm = false;
   bool pending_flat_vm = false;
   bool pending_s_buffer_store = false; /* GFX10 workaround */

   wait_imm barrier_imm[storage_count];
   uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */

   std::map<PhysReg, wait_entry> gpr_map;

   wait_ctx() {}
   wait_ctx(Program* program_)
       : program(program_), gfx_level(program_->gfx_level),
         max_vm_cnt(program_->gfx_level >= GFX9 ? 62 : 14), max_exp_cnt(6),
         max_lgkm_cnt(program_->gfx_level >= GFX10 ? 62 : 14),
         max_vs_cnt(program_->gfx_level >= GFX10 ? 62 : 0),
         unordered_events(event_smem | (program_->gfx_level < GFX10 ? event_flat : 0))
   {}
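
   /* Merges 'other' (the out-context of a predecessor) into this context.
    * Counters take the maximum and per-gpr entries combine to the lower
    * (stricter) wait value: for example, if one predecessor leaves v0 with
    * vm=1 and another with vm=0, the joined in-context keeps vm=0 so that a
    * later use of v0 waits conservatively. Returns true if anything changed;
    * a block whose in-context did not change since it was last handled is not
    * processed again.
    */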
   bool join(const wait_ctx* other, bool logical)
   {
      bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt ||
                     other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt ||
                     (other->pending_flat_lgkm && !pending_flat_lgkm) ||
                     (other->pending_flat_vm && !pending_flat_vm);

      exp_cnt = std::max(exp_cnt, other->exp_cnt);
      vm_cnt = std::max(vm_cnt, other->vm_cnt);
      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
      vs_cnt = std::max(vs_cnt, other->vs_cnt);
      pending_flat_lgkm |= other->pending_flat_lgkm;
      pending_flat_vm |= other->pending_flat_vm;
      pending_s_buffer_store |= other->pending_s_buffer_store;

      for (const auto& entry : other->gpr_map) {
         if (entry.second.logical != logical)
            continue;

         using iterator = std::map<PhysReg, wait_entry>::iterator;
         const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
         if (insert_pair.second) {
            changed = true;
         } else {
            changed |= insert_pair.first->second.join(entry.second);
         }
      }

      for (unsigned i = 0; i < storage_count; i++) {
         changed |= barrier_imm[i].combine(other->barrier_imm[i]);
         changed |= (other->barrier_events[i] & ~barrier_events[i]) != 0;
         barrier_events[i] |= other->barrier_events[i];
      }

      return changed;
   }

   void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter)
   {
      entry.remove_counter(counter);
   }
};

uint8_t
get_vmem_type(Instruction* instr)
{
   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
      return vmem_bvh;
   else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
            instr->operands[1].regClass() == s4)
      return vmem_sampler;
   else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
      return vmem_nosampler;
   return 0;
}
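
/* Gathers into 'wait' the counter values that must be waited for before
 * 'instr' may read its operands or overwrite its definitions. Registers whose
 * outstanding accesses are all of a single in-order event type (e.g. only
 * VMEM of the same vmem_type, or only LDS) can be overwritten without a wait,
 * because the new access will complete after the old one.
 */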
void
check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
{
   for (const Operand op : instr->operands) {
      if (op.isConstant() || op.isUndefined())
         continue;

      /* check consecutively read gprs */
      for (unsigned j = 0; j < op.size(); j++) {
         PhysReg reg{op.physReg() + j};
         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
            continue;

         wait.combine(it->second.imm);
      }
   }

   for (const Definition& def : instr->definitions) {
      /* check consecutively written gprs */
      for (unsigned j = 0; j < def.getTemp().size(); j++) {
         PhysReg reg{def.physReg() + j};

         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end())
            continue;

         /* Vector Memory reads and writes return in the order they were issued */
         uint8_t vmem_type = get_vmem_type(instr);
         if (vmem_type && ((it->second.events & vm_events) == event_vmem) &&
             it->second.vmem_types == vmem_type)
            continue;

         /* LDS reads and writes return in the order they were issued. same for GDS */
         if (instr->isDS() &&
             (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
            continue;

         wait.combine(it->second.imm);
      }
   }
}

bool
parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
{
   if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
       instr->definitions[0].physReg() == sgpr_null) {
      imm.vs = std::min<uint8_t>(imm.vs, instr->sopk().imm);
      return true;
   } else if (instr->opcode == aco_opcode::s_waitcnt) {
      imm.combine(wait_imm(ctx.gfx_level, instr->sopp().imm));
      return true;
   }
   return false;
}

void
perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
{
   sync_scope subgroup_scope =
      ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
   if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
      unsigned storage = sync.storage;
      while (storage) {
         unsigned idx = u_bit_scan(&storage);

         /* LDS is private to the workgroup */
         sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);

         uint16_t events = ctx.barrier_events[idx];
         if (bar_scope_lds <= subgroup_scope)
            events &= ~event_lds;

         /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
          * in-order for the same workgroup */
         if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
            events &= ~(event_vmem | event_vmem_store | event_smem);

         if (events)
            imm.combine(ctx.barrier_imm[idx]);
      }
   }
}

void
force_waitcnt(wait_ctx& ctx, wait_imm& imm)
{
   if (ctx.vm_cnt)
      imm.vm = 0;
   if (ctx.exp_cnt)
      imm.exp = 0;
   if (ctx.lgkm_cnt)
      imm.lgkm = 0;

   if (ctx.gfx_level >= GFX10) {
      if (ctx.vs_cnt)
         imm.vs = 0;
   }
}
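
/* Determines the wait_imm needed before 'instr' (operand/definition hazards,
 * barrier semantics and hardware workarounds) and then applies that wait to
 * the context: counters are clamped to the waited-for values and gpr entries
 * fully satisfied by the wait are dropped from the map.
 */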
void
kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
{
   if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
      /* Force emitting waitcnt states right after the instruction if there is
       * something to wait for. This is also applied for s_setpc_b64 to ensure
       * waitcnt states are inserted before jumping to the PS epilog.
       */
      force_waitcnt(ctx, imm);
   }

   if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
      check_instr(ctx, imm, instr);

   /* It's required to wait for scalar stores before "writing back" data.
    * It shouldn't cost anything anyways since we're about to do s_endpgm.
    */
   if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) {
      assert(ctx.gfx_level >= GFX8);
      imm.lgkm = 0;
   }

   if (ctx.gfx_level >= GFX10 && instr->isSMEM()) {
      /* GFX10: A store followed by a load at the same address causes a problem because
       * the load doesn't load the correct values unless we wait for the store first.
       * This is NOT mitigated by an s_nop.
       *
       * TODO: Refine this when we have proper alias analysis.
       */
      if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() &&
          !instr->smem().sync.can_reorder()) {
         imm.lgkm = 0;
      }
   }

   if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) {
      if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {

         /* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos
          * export. Wait for all stores (and atomics) to complete, so PS can read them.
          * TODO: This only really applies to DONE pos exports.
          *       Consider setting the DONE bit earlier.
          */
         if (ctx.vs_cnt > 0)
            imm.vs = 0;
         if (ctx.vm_cnt > 0)
            imm.vm = 0;
      }
   }

   if (instr->opcode == aco_opcode::p_barrier)
      perform_barrier(ctx, imm, instr->barrier().sync, semantic_acqrel);
   else
      perform_barrier(ctx, imm, sync_info, semantic_release);

   if (!imm.empty()) {
      if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
         imm.vm = 0;
      if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
         imm.lgkm = 0;

      /* reset counters */
      ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
      ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
      ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
      ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);

      /* update barrier wait imms */
      for (unsigned i = 0; i < storage_count; i++) {
         wait_imm& bar = ctx.barrier_imm[i];
         uint16_t& bar_ev = ctx.barrier_events[i];
         if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
            bar.exp = wait_imm::unset_counter;
            bar_ev &= ~exp_events;
         }
         if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) {
            bar.vm = wait_imm::unset_counter;
            bar_ev &= ~(vm_events & ~event_flat);
         }
         if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) {
            bar.lgkm = wait_imm::unset_counter;
            bar_ev &= ~(lgkm_events & ~event_flat);
         }
         if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) {
            bar.vs = wait_imm::unset_counter;
            bar_ev &= ~vs_events;
         }
         if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter)
            bar_ev &= ~event_flat;
      }

      /* remove all gprs with higher counter from map */
      std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
         if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
         if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm);
         if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
         if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
         if (!it->second.counters)
            it = ctx.gpr_map.erase(it);
         else
            it++;
      }
   }

   if (imm.vm == 0)
      ctx.pending_flat_vm = false;
   if (imm.lgkm == 0) {
      ctx.pending_flat_lgkm = false;
      ctx.pending_s_buffer_store = false;
   }
}
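
/* barrier_imm[storage] tracks, per storage class, the counter values that
 * would have to be waited for to make every still-outstanding access to that
 * storage visible; perform_barrier() folds them into the pending wait when an
 * acquire/release barrier with matching storage and a wide enough scope is
 * encountered. The helpers below age those values as new events are issued.
 */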
void
update_barrier_counter(uint8_t* ctr, unsigned max)
{
   if (*ctr != wait_imm::unset_counter && *ctr < max)
      (*ctr)++;
}

void
update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
{
   for (unsigned i = 0; i < storage_count; i++) {
      wait_imm& bar = ctx.barrier_imm[i];
      uint16_t& bar_ev = ctx.barrier_events[i];
      if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
         bar_ev |= event;
         if (counters & counter_lgkm)
            bar.lgkm = 0;
         if (counters & counter_vm)
            bar.vm = 0;
         if (counters & counter_exp)
            bar.exp = 0;
         if (counters & counter_vs)
            bar.vs = 0;
      } else if (!(bar_ev & ctx.unordered_events) && !(ctx.unordered_events & event)) {
         if (counters & counter_lgkm && (bar_ev & lgkm_events) == event)
            update_barrier_counter(&bar.lgkm, ctx.max_lgkm_cnt);
         if (counters & counter_vm && (bar_ev & vm_events) == event)
            update_barrier_counter(&bar.vm, ctx.max_vm_cnt);
         if (counters & counter_exp && (bar_ev & exp_events) == event)
            update_barrier_counter(&bar.exp, ctx.max_exp_cnt);
         if (counters & counter_vs && (bar_ev & vs_events) == event)
            update_barrier_counter(&bar.vs, ctx.max_vs_cnt);
      }
   }
}

void
update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
{
   uint8_t counters = get_counters_for_event(event);

   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;
   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
      ctx.exp_cnt++;
   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
      ctx.vs_cnt++;

   update_barrier_imm(ctx, counters, event, sync);

   if (ctx.unordered_events & event)
      return;

   if (ctx.pending_flat_lgkm)
      counters &= ~counter_lgkm;
   if (ctx.pending_flat_vm)
      counters &= ~counter_vm;

   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
      wait_entry& entry = e.second;

      if (entry.events & ctx.unordered_events)
         continue;

      assert(entry.events);

      if ((counters & counter_exp) && (entry.events & exp_events) == event &&
          entry.imm.exp < ctx.max_exp_cnt)
         entry.imm.exp++;
      if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event &&
          entry.imm.lgkm < ctx.max_lgkm_cnt)
         entry.imm.lgkm++;
      if ((counters & counter_vm) && (entry.events & vm_events) == event &&
          entry.imm.vm < ctx.max_vm_cnt)
         entry.imm.vm++;
      if ((counters & counter_vs) && (entry.events & vs_events) == event &&
          entry.imm.vs < ctx.max_vs_cnt)
         entry.imm.vs++;
   }
}

void
update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
{
   assert(ctx.gfx_level < GFX10);

   if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;

   update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);

   /* iterate by reference so the resets are applied to the map, not to copies */
   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
      if (e.second.counters & counter_vm)
         e.second.imm.vm = 0;
      if (e.second.counters & counter_lgkm)
         e.second.imm.lgkm = 0;
   }
   ctx.pending_flat_lgkm = true;
   ctx.pending_flat_vm = true;
}

void
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
                  uint8_t vmem_types = 0)
{
   uint16_t counters = get_counters_for_event(event);
   wait_imm imm;
   if (counters & counter_lgkm)
      imm.lgkm = 0;
   if (counters & counter_vm)
      imm.vm = 0;
   if (counters & counter_exp)
      imm.exp = 0;
   if (counters & counter_vs)
      imm.vs = 0;

   wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);
   new_entry.vmem_types |= vmem_types;

   for (unsigned i = 0; i < rc.size(); i++) {
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
      if (!it.second)
         it.first->second.join(new_entry);
   }
}

void
insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0)
{
   if (!op.isConstant() && !op.isUndefined())
      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types);
}

void
insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0)
{
   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types);
}
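
/* Records the effects of 'instr' in the context: the counters belonging to
 * its event are incremented and wait entries are added for the registers the
 * instruction will later write back to (definitions) or that must stay
 * unchanged until the hardware has read them (e.g. export sources, and GDS or
 * GFX6 VMEM store data, tracked via exp_cnt).
 */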
void
gen(Instruction* instr, wait_ctx& ctx)
{
   switch (instr->format) {
   case Format::EXP: {
      Export_instruction& exp_instr = instr->exp();

      wait_event ev;
      if (exp_instr.dest <= 9)
         ev = event_exp_mrt_null;
      else if (exp_instr.dest <= 15)
         ev = event_exp_pos;
      else
         ev = event_exp_param;
      update_counters(ctx, ev);

      /* insert new entries for exported vgprs */
      for (unsigned i = 0; i < 4; i++) {
         if (exp_instr.enabled_mask & (1 << i)) {
            unsigned idx = exp_instr.compressed ? i >> 1 : i;
            assert(idx < exp_instr.operands.size());
            insert_wait_entry(ctx, exp_instr.operands[idx], ev);
         }
      }
      insert_wait_entry(ctx, exec, s2, ev, false);
      break;
   }
   case Format::FLAT: {
      FLAT_instruction& flat = instr->flat();
      if (ctx.gfx_level < GFX10 && !instr->definitions.empty())
         update_counters_for_flat_load(ctx, flat.sync);
      else
         update_counters(ctx, event_flat, flat.sync);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_flat);
      break;
   }
   case Format::SMEM: {
      SMEM_instruction& smem = instr->smem();
      update_counters(ctx, event_smem, smem.sync);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_smem);
      else if (ctx.gfx_level >= GFX10 && !smem.sync.can_reorder())
         ctx.pending_s_buffer_store = true;

      break;
   }
   case Format::DS: {
      DS_instruction& ds = instr->ds();
      update_counters(ctx, ds.gds ? event_gds : event_lds, ds.sync);
      if (ds.gds)
         update_counters(ctx, event_gds_gpr_lock);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ds.gds ? event_gds : event_lds);

      if (ds.gds) {
         for (const Operand& op : instr->operands)
            insert_wait_entry(ctx, op, event_gds_gpr_lock);
         insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
      }
      break;
   }
   case Format::MUBUF:
   case Format::MTBUF:
   case Format::MIMG:
   case Format::GLOBAL:
   case Format::SCRATCH: {
      wait_event ev =
         !instr->definitions.empty() || ctx.gfx_level < GFX10 ? event_vmem : event_vmem_store;
      update_counters(ctx, ev, get_sync_info(instr));

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ev, get_vmem_type(instr));

      if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
      } else if (ctx.gfx_level == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
      }

      break;
   }
   case Format::SOPP: {
      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt)
         update_counters(ctx, event_sendmsg);
      break;
   }
   default: break;
   }
}

void
emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
{
   if (imm.vs != wait_imm::unset_counter) {
      assert(ctx.gfx_level >= GFX10);
      SOPK_instruction* waitcnt_vs =
         create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
      waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
      waitcnt_vs->imm = imm.vs;
      instructions.emplace_back(waitcnt_vs);
      imm.vs = wait_imm::unset_counter;
   }
   if (!imm.empty()) {
      SOPP_instruction* waitcnt =
         create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
      waitcnt->imm = imm.pack(ctx.gfx_level);
      waitcnt->block = -1;
      instructions.emplace_back(waitcnt);
   }
   imm = wait_imm();
}
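
/* Processes one block in a single forward pass: existing waits are folded
 * into 'queued_imm', kill() adds whatever the next instruction requires, and
 * a combined s_waitcnt (plus s_waitcnt_vscnt on GFX10+) is emitted right
 * before the instruction that needs it, so adjacent waits collapse into one.
 */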
void
handle_block(Program* program, Block& block, wait_ctx& ctx)
{
   std::vector<aco_ptr<Instruction>> new_instructions;

   wait_imm queued_imm;

   for (aco_ptr<Instruction>& instr : block.instructions) {
      bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());

      memory_sync_info sync_info = get_sync_info(instr.get());
      kill(queued_imm, instr.get(), ctx, sync_info);

      gen(instr.get(), ctx);

      if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
         if (!queued_imm.empty())
            emit_waitcnt(ctx, new_instructions, queued_imm);

         new_instructions.emplace_back(std::move(instr));
         perform_barrier(ctx, queued_imm, sync_info, semantic_acquire);
      }
   }

   if (!queued_imm.empty())
      emit_waitcnt(ctx, new_instructions, queued_imm);

   block.instructions.swap(new_instructions);
}

} /* end namespace */
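
/* Entry point: walks the blocks in reverse postorder and joins the
 * out-contexts of each block's predecessors into its in-context before
 * rewriting the block. A loop exit jumps back to its loop header as long as
 * the previous sweep over the loop body still made progress, so loops are
 * reprocessed until the contexts reach a fixed point.
 */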
void
insert_wait_states(Program* program)
{
   /* per BB ctx */
   std::vector<bool> done(program->blocks.size());
   std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program));
   std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program));

   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
   unsigned loop_progress = 0;

   if (program->stage.has(SWStage::VS) && program->info.vs.dynamic_inputs) {
      for (Definition def : program->vs_inputs) {
         update_counters(in_ctx[0], event_vmem);
         insert_wait_entry(in_ctx[0], def, event_vmem);
      }
   }

   for (unsigned i = 0; i < program->blocks.size();) {
      Block& current = program->blocks[i++];
      wait_ctx ctx = in_ctx[current.index];

      if (current.kind & block_kind_loop_header) {
         loop_header_indices.push(current.index);
      } else if (current.kind & block_kind_loop_exit) {
         bool repeat = false;
         if (loop_progress == loop_header_indices.size()) {
            i = loop_header_indices.top();
            repeat = true;
         }
         loop_header_indices.pop();
         loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
         if (repeat)
            continue;
      }

      bool changed = false;
      for (unsigned b : current.linear_preds)
         changed |= ctx.join(&out_ctx[b], false);
      for (unsigned b : current.logical_preds)
         changed |= ctx.join(&out_ctx[b], true);

      if (done[current.index] && !changed) {
         in_ctx[current.index] = std::move(ctx);
         continue;
      } else {
         in_ctx[current.index] = ctx;
      }

      if (current.instructions.empty()) {
         out_ctx[current.index] = std::move(ctx);
         continue;
      }

      loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
      done[current.index] = true;

      handle_block(program, current, ctx);

      out_ctx[current.index] = std::move(ctx);
   }
}

} // namespace aco