1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2018 Valve Corporation
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci *
23bf215546Sopenharmony_ci */
24bf215546Sopenharmony_ci
25bf215546Sopenharmony_ci#include "aco_ir.h"
26bf215546Sopenharmony_ci
27bf215546Sopenharmony_ci#include "common/sid.h"
28bf215546Sopenharmony_ci
29bf215546Sopenharmony_ci#include <map>
30bf215546Sopenharmony_ci#include <stack>
31bf215546Sopenharmony_ci#include <vector>
32bf215546Sopenharmony_ci
33bf215546Sopenharmony_cinamespace aco {
34bf215546Sopenharmony_ci
35bf215546Sopenharmony_cinamespace {
36bf215546Sopenharmony_ci
37bf215546Sopenharmony_ci/**
38bf215546Sopenharmony_ci * The general idea of this pass is:
39bf215546Sopenharmony_ci * The CFG is traversed in reverse postorder (forward) and loops are processed
40bf215546Sopenharmony_ci * several times until no progress is made.
41bf215546Sopenharmony_ci * Per BB two wait_ctx is maintained: an in-context and out-context.
42bf215546Sopenharmony_ci * The in-context is the joined out-contexts of the predecessors.
43bf215546Sopenharmony_ci * The context contains a map: gpr -> wait_entry
44bf215546Sopenharmony_ci * consisting of the information about the cnt values to be waited for.
45bf215546Sopenharmony_ci * Note: After merge-nodes, it might occur that for the same register
46bf215546Sopenharmony_ci *       multiple cnt values are to be waited for.
47bf215546Sopenharmony_ci *
48bf215546Sopenharmony_ci * The values are updated according to the encountered instructions:
49bf215546Sopenharmony_ci * - additional events increment the counter of waits of the same type
50bf215546Sopenharmony_ci * - or erase gprs with counters higher than to be waited for.
51bf215546Sopenharmony_ci */
52bf215546Sopenharmony_ci
53bf215546Sopenharmony_ci// TODO: do a more clever insertion of wait_cnt (lgkm_cnt)
54bf215546Sopenharmony_ci// when there is a load followed by a use of a previous load
55bf215546Sopenharmony_ci
56bf215546Sopenharmony_ci/* Instructions of the same event will finish in-order except for smem
57bf215546Sopenharmony_ci * and maybe flat. Instructions of different events may not finish in-order. */
58bf215546Sopenharmony_cienum wait_event : uint16_t {
59bf215546Sopenharmony_ci   event_smem = 1 << 0,
60bf215546Sopenharmony_ci   event_lds = 1 << 1,
61bf215546Sopenharmony_ci   event_gds = 1 << 2,
62bf215546Sopenharmony_ci   event_vmem = 1 << 3,
63bf215546Sopenharmony_ci   event_vmem_store = 1 << 4, /* GFX10+ */
64bf215546Sopenharmony_ci   event_flat = 1 << 5,
65bf215546Sopenharmony_ci   event_exp_pos = 1 << 6,
66bf215546Sopenharmony_ci   event_exp_param = 1 << 7,
67bf215546Sopenharmony_ci   event_exp_mrt_null = 1 << 8,
68bf215546Sopenharmony_ci   event_gds_gpr_lock = 1 << 9,
69bf215546Sopenharmony_ci   event_vmem_gpr_lock = 1 << 10,
70bf215546Sopenharmony_ci   event_sendmsg = 1 << 11,
71bf215546Sopenharmony_ci   num_events = 12,
72bf215546Sopenharmony_ci};
73bf215546Sopenharmony_ci
74bf215546Sopenharmony_cienum counter_type : uint8_t {
75bf215546Sopenharmony_ci   counter_exp = 1 << 0,
76bf215546Sopenharmony_ci   counter_lgkm = 1 << 1,
77bf215546Sopenharmony_ci   counter_vm = 1 << 2,
78bf215546Sopenharmony_ci   counter_vs = 1 << 3,
79bf215546Sopenharmony_ci   num_counters = 4,
80bf215546Sopenharmony_ci};
81bf215546Sopenharmony_ci
82bf215546Sopenharmony_cienum vmem_type : uint8_t {
83bf215546Sopenharmony_ci   vmem_nosampler = 1 << 0,
84bf215546Sopenharmony_ci   vmem_sampler = 1 << 1,
85bf215546Sopenharmony_ci   vmem_bvh = 1 << 2,
86bf215546Sopenharmony_ci};
87bf215546Sopenharmony_ci
88bf215546Sopenharmony_cistatic const uint16_t exp_events =
89bf215546Sopenharmony_ci   event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
90bf215546Sopenharmony_cistatic const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
91bf215546Sopenharmony_cistatic const uint16_t vm_events = event_vmem | event_flat;
92bf215546Sopenharmony_cistatic const uint16_t vs_events = event_vmem_store;
93bf215546Sopenharmony_ci
94bf215546Sopenharmony_ciuint8_t
95bf215546Sopenharmony_ciget_counters_for_event(wait_event ev)
96bf215546Sopenharmony_ci{
97bf215546Sopenharmony_ci   switch (ev) {
98bf215546Sopenharmony_ci   case event_smem:
99bf215546Sopenharmony_ci   case event_lds:
100bf215546Sopenharmony_ci   case event_gds:
101bf215546Sopenharmony_ci   case event_sendmsg: return counter_lgkm;
102bf215546Sopenharmony_ci   case event_vmem: return counter_vm;
103bf215546Sopenharmony_ci   case event_vmem_store: return counter_vs;
104bf215546Sopenharmony_ci   case event_flat: return counter_vm | counter_lgkm;
105bf215546Sopenharmony_ci   case event_exp_pos:
106bf215546Sopenharmony_ci   case event_exp_param:
107bf215546Sopenharmony_ci   case event_exp_mrt_null:
108bf215546Sopenharmony_ci   case event_gds_gpr_lock:
109bf215546Sopenharmony_ci   case event_vmem_gpr_lock: return counter_exp;
110bf215546Sopenharmony_ci   default: return 0;
111bf215546Sopenharmony_ci   }
112bf215546Sopenharmony_ci}
113bf215546Sopenharmony_ci
114bf215546Sopenharmony_cistruct wait_entry {
115bf215546Sopenharmony_ci   wait_imm imm;
116bf215546Sopenharmony_ci   uint16_t events;  /* use wait_event notion */
117bf215546Sopenharmony_ci   uint8_t counters; /* use counter_type notion */
118bf215546Sopenharmony_ci   bool wait_on_read : 1;
119bf215546Sopenharmony_ci   bool logical : 1;
120bf215546Sopenharmony_ci   uint8_t vmem_types : 4;
121bf215546Sopenharmony_ci
122bf215546Sopenharmony_ci   wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_)
123bf215546Sopenharmony_ci       : imm(imm_), events(event_), counters(get_counters_for_event(event_)),
124bf215546Sopenharmony_ci         wait_on_read(wait_on_read_), logical(logical_), vmem_types(0)
125bf215546Sopenharmony_ci   {}
126bf215546Sopenharmony_ci
127bf215546Sopenharmony_ci   bool join(const wait_entry& other)
128bf215546Sopenharmony_ci   {
129bf215546Sopenharmony_ci      bool changed = (other.events & ~events) || (other.counters & ~counters) ||
130bf215546Sopenharmony_ci                     (other.wait_on_read && !wait_on_read) || (other.vmem_types & !vmem_types);
131bf215546Sopenharmony_ci      events |= other.events;
132bf215546Sopenharmony_ci      counters |= other.counters;
133bf215546Sopenharmony_ci      changed |= imm.combine(other.imm);
134bf215546Sopenharmony_ci      wait_on_read |= other.wait_on_read;
135bf215546Sopenharmony_ci      vmem_types |= other.vmem_types;
136bf215546Sopenharmony_ci      assert(logical == other.logical);
137bf215546Sopenharmony_ci      return changed;
138bf215546Sopenharmony_ci   }
139bf215546Sopenharmony_ci
140bf215546Sopenharmony_ci   void remove_counter(counter_type counter)
141bf215546Sopenharmony_ci   {
142bf215546Sopenharmony_ci      counters &= ~counter;
143bf215546Sopenharmony_ci
144bf215546Sopenharmony_ci      if (counter == counter_lgkm) {
145bf215546Sopenharmony_ci         imm.lgkm = wait_imm::unset_counter;
146bf215546Sopenharmony_ci         events &= ~(event_smem | event_lds | event_gds | event_sendmsg);
147bf215546Sopenharmony_ci      }
148bf215546Sopenharmony_ci
149bf215546Sopenharmony_ci      if (counter == counter_vm) {
150bf215546Sopenharmony_ci         imm.vm = wait_imm::unset_counter;
151bf215546Sopenharmony_ci         events &= ~event_vmem;
152bf215546Sopenharmony_ci         vmem_types = 0;
153bf215546Sopenharmony_ci      }
154bf215546Sopenharmony_ci
155bf215546Sopenharmony_ci      if (counter == counter_exp) {
156bf215546Sopenharmony_ci         imm.exp = wait_imm::unset_counter;
157bf215546Sopenharmony_ci         events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock |
158bf215546Sopenharmony_ci                     event_vmem_gpr_lock);
159bf215546Sopenharmony_ci      }
160bf215546Sopenharmony_ci
161bf215546Sopenharmony_ci      if (counter == counter_vs) {
162bf215546Sopenharmony_ci         imm.vs = wait_imm::unset_counter;
163bf215546Sopenharmony_ci         events &= ~event_vmem_store;
164bf215546Sopenharmony_ci      }
165bf215546Sopenharmony_ci
166bf215546Sopenharmony_ci      if (!(counters & counter_lgkm) && !(counters & counter_vm))
167bf215546Sopenharmony_ci         events &= ~event_flat;
168bf215546Sopenharmony_ci   }
169bf215546Sopenharmony_ci};
170bf215546Sopenharmony_ci
171bf215546Sopenharmony_cistruct wait_ctx {
172bf215546Sopenharmony_ci   Program* program;
173bf215546Sopenharmony_ci   enum amd_gfx_level gfx_level;
174bf215546Sopenharmony_ci   uint16_t max_vm_cnt;
175bf215546Sopenharmony_ci   uint16_t max_exp_cnt;
176bf215546Sopenharmony_ci   uint16_t max_lgkm_cnt;
177bf215546Sopenharmony_ci   uint16_t max_vs_cnt;
178bf215546Sopenharmony_ci   uint16_t unordered_events = event_smem | event_flat;
179bf215546Sopenharmony_ci
180bf215546Sopenharmony_ci   uint8_t vm_cnt = 0;
181bf215546Sopenharmony_ci   uint8_t exp_cnt = 0;
182bf215546Sopenharmony_ci   uint8_t lgkm_cnt = 0;
183bf215546Sopenharmony_ci   uint8_t vs_cnt = 0;
184bf215546Sopenharmony_ci   bool pending_flat_lgkm = false;
185bf215546Sopenharmony_ci   bool pending_flat_vm = false;
186bf215546Sopenharmony_ci   bool pending_s_buffer_store = false; /* GFX10 workaround */
187bf215546Sopenharmony_ci
188bf215546Sopenharmony_ci   wait_imm barrier_imm[storage_count];
189bf215546Sopenharmony_ci   uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */
190bf215546Sopenharmony_ci
191bf215546Sopenharmony_ci   std::map<PhysReg, wait_entry> gpr_map;
192bf215546Sopenharmony_ci
193bf215546Sopenharmony_ci   wait_ctx() {}
194bf215546Sopenharmony_ci   wait_ctx(Program* program_)
195bf215546Sopenharmony_ci       : program(program_), gfx_level(program_->gfx_level),
196bf215546Sopenharmony_ci         max_vm_cnt(program_->gfx_level >= GFX9 ? 62 : 14), max_exp_cnt(6),
197bf215546Sopenharmony_ci         max_lgkm_cnt(program_->gfx_level >= GFX10 ? 62 : 14),
198bf215546Sopenharmony_ci         max_vs_cnt(program_->gfx_level >= GFX10 ? 62 : 0),
199bf215546Sopenharmony_ci         unordered_events(event_smem | (program_->gfx_level < GFX10 ? event_flat : 0))
200bf215546Sopenharmony_ci   {}
201bf215546Sopenharmony_ci
202bf215546Sopenharmony_ci   bool join(const wait_ctx* other, bool logical)
203bf215546Sopenharmony_ci   {
204bf215546Sopenharmony_ci      bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt ||
205bf215546Sopenharmony_ci                     other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt ||
206bf215546Sopenharmony_ci                     (other->pending_flat_lgkm && !pending_flat_lgkm) ||
207bf215546Sopenharmony_ci                     (other->pending_flat_vm && !pending_flat_vm);
208bf215546Sopenharmony_ci
209bf215546Sopenharmony_ci      exp_cnt = std::max(exp_cnt, other->exp_cnt);
210bf215546Sopenharmony_ci      vm_cnt = std::max(vm_cnt, other->vm_cnt);
211bf215546Sopenharmony_ci      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
212bf215546Sopenharmony_ci      vs_cnt = std::max(vs_cnt, other->vs_cnt);
213bf215546Sopenharmony_ci      pending_flat_lgkm |= other->pending_flat_lgkm;
214bf215546Sopenharmony_ci      pending_flat_vm |= other->pending_flat_vm;
215bf215546Sopenharmony_ci      pending_s_buffer_store |= other->pending_s_buffer_store;
216bf215546Sopenharmony_ci
217bf215546Sopenharmony_ci      for (const auto& entry : other->gpr_map) {
218bf215546Sopenharmony_ci         if (entry.second.logical != logical)
219bf215546Sopenharmony_ci            continue;
220bf215546Sopenharmony_ci
221bf215546Sopenharmony_ci         using iterator = std::map<PhysReg, wait_entry>::iterator;
222bf215546Sopenharmony_ci         const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
223bf215546Sopenharmony_ci         if (insert_pair.second) {
224bf215546Sopenharmony_ci            changed = true;
225bf215546Sopenharmony_ci         } else {
226bf215546Sopenharmony_ci            changed |= insert_pair.first->second.join(entry.second);
227bf215546Sopenharmony_ci         }
228bf215546Sopenharmony_ci      }
229bf215546Sopenharmony_ci
230bf215546Sopenharmony_ci      for (unsigned i = 0; i < storage_count; i++) {
231bf215546Sopenharmony_ci         changed |= barrier_imm[i].combine(other->barrier_imm[i]);
232bf215546Sopenharmony_ci         changed |= (other->barrier_events[i] & ~barrier_events[i]) != 0;
233bf215546Sopenharmony_ci         barrier_events[i] |= other->barrier_events[i];
234bf215546Sopenharmony_ci      }
235bf215546Sopenharmony_ci
236bf215546Sopenharmony_ci      return changed;
237bf215546Sopenharmony_ci   }
238bf215546Sopenharmony_ci
239bf215546Sopenharmony_ci   void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter)
240bf215546Sopenharmony_ci   {
241bf215546Sopenharmony_ci      entry.remove_counter(counter);
242bf215546Sopenharmony_ci   }
243bf215546Sopenharmony_ci};
244bf215546Sopenharmony_ci
245bf215546Sopenharmony_ciuint8_t
246bf215546Sopenharmony_ciget_vmem_type(Instruction* instr)
247bf215546Sopenharmony_ci{
248bf215546Sopenharmony_ci   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
249bf215546Sopenharmony_ci      return vmem_bvh;
250bf215546Sopenharmony_ci   else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
251bf215546Sopenharmony_ci            instr->operands[1].regClass() == s4)
252bf215546Sopenharmony_ci      return vmem_sampler;
253bf215546Sopenharmony_ci   else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
254bf215546Sopenharmony_ci      return vmem_nosampler;
255bf215546Sopenharmony_ci   return 0;
256bf215546Sopenharmony_ci}
257bf215546Sopenharmony_ci
258bf215546Sopenharmony_civoid
259bf215546Sopenharmony_cicheck_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
260bf215546Sopenharmony_ci{
261bf215546Sopenharmony_ci   for (const Operand op : instr->operands) {
262bf215546Sopenharmony_ci      if (op.isConstant() || op.isUndefined())
263bf215546Sopenharmony_ci         continue;
264bf215546Sopenharmony_ci
265bf215546Sopenharmony_ci      /* check consecutively read gprs */
266bf215546Sopenharmony_ci      for (unsigned j = 0; j < op.size(); j++) {
267bf215546Sopenharmony_ci         PhysReg reg{op.physReg() + j};
268bf215546Sopenharmony_ci         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
269bf215546Sopenharmony_ci         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
270bf215546Sopenharmony_ci            continue;
271bf215546Sopenharmony_ci
272bf215546Sopenharmony_ci         wait.combine(it->second.imm);
273bf215546Sopenharmony_ci      }
274bf215546Sopenharmony_ci   }
275bf215546Sopenharmony_ci
276bf215546Sopenharmony_ci   for (const Definition& def : instr->definitions) {
277bf215546Sopenharmony_ci      /* check consecutively written gprs */
278bf215546Sopenharmony_ci      for (unsigned j = 0; j < def.getTemp().size(); j++) {
279bf215546Sopenharmony_ci         PhysReg reg{def.physReg() + j};
280bf215546Sopenharmony_ci
281bf215546Sopenharmony_ci         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
282bf215546Sopenharmony_ci         if (it == ctx.gpr_map.end())
283bf215546Sopenharmony_ci            continue;
284bf215546Sopenharmony_ci
285bf215546Sopenharmony_ci         /* Vector Memory reads and writes return in the order they were issued */
286bf215546Sopenharmony_ci         uint8_t vmem_type = get_vmem_type(instr);
287bf215546Sopenharmony_ci         if (vmem_type && ((it->second.events & vm_events) == event_vmem) &&
288bf215546Sopenharmony_ci             it->second.vmem_types == vmem_type)
289bf215546Sopenharmony_ci            continue;
290bf215546Sopenharmony_ci
291bf215546Sopenharmony_ci         /* LDS reads and writes return in the order they were issued. same for GDS */
292bf215546Sopenharmony_ci         if (instr->isDS() &&
293bf215546Sopenharmony_ci             (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
294bf215546Sopenharmony_ci            continue;
295bf215546Sopenharmony_ci
296bf215546Sopenharmony_ci         wait.combine(it->second.imm);
297bf215546Sopenharmony_ci      }
298bf215546Sopenharmony_ci   }
299bf215546Sopenharmony_ci}
300bf215546Sopenharmony_ci
301bf215546Sopenharmony_cibool
302bf215546Sopenharmony_ciparse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
303bf215546Sopenharmony_ci{
304bf215546Sopenharmony_ci   if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
305bf215546Sopenharmony_ci       instr->definitions[0].physReg() == sgpr_null) {
306bf215546Sopenharmony_ci      imm.vs = std::min<uint8_t>(imm.vs, instr->sopk().imm);
307bf215546Sopenharmony_ci      return true;
308bf215546Sopenharmony_ci   } else if (instr->opcode == aco_opcode::s_waitcnt) {
309bf215546Sopenharmony_ci      imm.combine(wait_imm(ctx.gfx_level, instr->sopp().imm));
310bf215546Sopenharmony_ci      return true;
311bf215546Sopenharmony_ci   }
312bf215546Sopenharmony_ci   return false;
313bf215546Sopenharmony_ci}
314bf215546Sopenharmony_ci
315bf215546Sopenharmony_civoid
316bf215546Sopenharmony_ciperform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
317bf215546Sopenharmony_ci{
318bf215546Sopenharmony_ci   sync_scope subgroup_scope =
319bf215546Sopenharmony_ci      ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
320bf215546Sopenharmony_ci   if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
321bf215546Sopenharmony_ci      unsigned storage = sync.storage;
322bf215546Sopenharmony_ci      while (storage) {
323bf215546Sopenharmony_ci         unsigned idx = u_bit_scan(&storage);
324bf215546Sopenharmony_ci
325bf215546Sopenharmony_ci         /* LDS is private to the workgroup */
326bf215546Sopenharmony_ci         sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);
327bf215546Sopenharmony_ci
328bf215546Sopenharmony_ci         uint16_t events = ctx.barrier_events[idx];
329bf215546Sopenharmony_ci         if (bar_scope_lds <= subgroup_scope)
330bf215546Sopenharmony_ci            events &= ~event_lds;
331bf215546Sopenharmony_ci
332bf215546Sopenharmony_ci         /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
333bf215546Sopenharmony_ci          * in-order for the same workgroup */
334bf215546Sopenharmony_ci         if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
335bf215546Sopenharmony_ci            events &= ~(event_vmem | event_vmem_store | event_smem);
336bf215546Sopenharmony_ci
337bf215546Sopenharmony_ci         if (events)
338bf215546Sopenharmony_ci            imm.combine(ctx.barrier_imm[idx]);
339bf215546Sopenharmony_ci      }
340bf215546Sopenharmony_ci   }
341bf215546Sopenharmony_ci}
342bf215546Sopenharmony_ci
343bf215546Sopenharmony_civoid
344bf215546Sopenharmony_ciforce_waitcnt(wait_ctx& ctx, wait_imm& imm)
345bf215546Sopenharmony_ci{
346bf215546Sopenharmony_ci   if (ctx.vm_cnt)
347bf215546Sopenharmony_ci      imm.vm = 0;
348bf215546Sopenharmony_ci   if (ctx.exp_cnt)
349bf215546Sopenharmony_ci      imm.exp = 0;
350bf215546Sopenharmony_ci   if (ctx.lgkm_cnt)
351bf215546Sopenharmony_ci      imm.lgkm = 0;
352bf215546Sopenharmony_ci
353bf215546Sopenharmony_ci   if (ctx.gfx_level >= GFX10) {
354bf215546Sopenharmony_ci      if (ctx.vs_cnt)
355bf215546Sopenharmony_ci         imm.vs = 0;
356bf215546Sopenharmony_ci   }
357bf215546Sopenharmony_ci}
358bf215546Sopenharmony_ci
359bf215546Sopenharmony_civoid
360bf215546Sopenharmony_cikill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
361bf215546Sopenharmony_ci{
362bf215546Sopenharmony_ci   if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
363bf215546Sopenharmony_ci      /* Force emitting waitcnt states right after the instruction if there is
364bf215546Sopenharmony_ci       * something to wait for. This is also applied for s_setpc_b64 to ensure
365bf215546Sopenharmony_ci       * waitcnt states are inserted before jumping to the PS epilog.
366bf215546Sopenharmony_ci       */
367bf215546Sopenharmony_ci      force_waitcnt(ctx, imm);
368bf215546Sopenharmony_ci   }
369bf215546Sopenharmony_ci
370bf215546Sopenharmony_ci   if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
371bf215546Sopenharmony_ci      check_instr(ctx, imm, instr);
372bf215546Sopenharmony_ci
373bf215546Sopenharmony_ci   /* It's required to wait for scalar stores before "writing back" data.
374bf215546Sopenharmony_ci    * It shouldn't cost anything anyways since we're about to do s_endpgm.
375bf215546Sopenharmony_ci    */
376bf215546Sopenharmony_ci   if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) {
377bf215546Sopenharmony_ci      assert(ctx.gfx_level >= GFX8);
378bf215546Sopenharmony_ci      imm.lgkm = 0;
379bf215546Sopenharmony_ci   }
380bf215546Sopenharmony_ci
381bf215546Sopenharmony_ci   if (ctx.gfx_level >= GFX10 && instr->isSMEM()) {
382bf215546Sopenharmony_ci      /* GFX10: A store followed by a load at the same address causes a problem because
383bf215546Sopenharmony_ci       * the load doesn't load the correct values unless we wait for the store first.
384bf215546Sopenharmony_ci       * This is NOT mitigated by an s_nop.
385bf215546Sopenharmony_ci       *
386bf215546Sopenharmony_ci       * TODO: Refine this when we have proper alias analysis.
387bf215546Sopenharmony_ci       */
388bf215546Sopenharmony_ci      if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() &&
389bf215546Sopenharmony_ci          !instr->smem().sync.can_reorder()) {
390bf215546Sopenharmony_ci         imm.lgkm = 0;
391bf215546Sopenharmony_ci      }
392bf215546Sopenharmony_ci   }
393bf215546Sopenharmony_ci
394bf215546Sopenharmony_ci   if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) {
395bf215546Sopenharmony_ci      if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {
396bf215546Sopenharmony_ci
397bf215546Sopenharmony_ci         /* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos
398bf215546Sopenharmony_ci          * export. Wait for all stores (and atomics) to complete, so PS can read them.
399bf215546Sopenharmony_ci          * TODO: This only really applies to DONE pos exports.
400bf215546Sopenharmony_ci          *       Consider setting the DONE bit earlier.
401bf215546Sopenharmony_ci          */
402bf215546Sopenharmony_ci         if (ctx.vs_cnt > 0)
403bf215546Sopenharmony_ci            imm.vs = 0;
404bf215546Sopenharmony_ci         if (ctx.vm_cnt > 0)
405bf215546Sopenharmony_ci            imm.vm = 0;
406bf215546Sopenharmony_ci      }
407bf215546Sopenharmony_ci   }
408bf215546Sopenharmony_ci
409bf215546Sopenharmony_ci   if (instr->opcode == aco_opcode::p_barrier)
410bf215546Sopenharmony_ci      perform_barrier(ctx, imm, instr->barrier().sync, semantic_acqrel);
411bf215546Sopenharmony_ci   else
412bf215546Sopenharmony_ci      perform_barrier(ctx, imm, sync_info, semantic_release);
413bf215546Sopenharmony_ci
414bf215546Sopenharmony_ci   if (!imm.empty()) {
415bf215546Sopenharmony_ci      if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
416bf215546Sopenharmony_ci         imm.vm = 0;
417bf215546Sopenharmony_ci      if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
418bf215546Sopenharmony_ci         imm.lgkm = 0;
419bf215546Sopenharmony_ci
420bf215546Sopenharmony_ci      /* reset counters */
421bf215546Sopenharmony_ci      ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
422bf215546Sopenharmony_ci      ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
423bf215546Sopenharmony_ci      ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
424bf215546Sopenharmony_ci      ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
425bf215546Sopenharmony_ci
426bf215546Sopenharmony_ci      /* update barrier wait imms */
427bf215546Sopenharmony_ci      for (unsigned i = 0; i < storage_count; i++) {
428bf215546Sopenharmony_ci         wait_imm& bar = ctx.barrier_imm[i];
429bf215546Sopenharmony_ci         uint16_t& bar_ev = ctx.barrier_events[i];
430bf215546Sopenharmony_ci         if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
431bf215546Sopenharmony_ci            bar.exp = wait_imm::unset_counter;
432bf215546Sopenharmony_ci            bar_ev &= ~exp_events;
433bf215546Sopenharmony_ci         }
434bf215546Sopenharmony_ci         if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) {
435bf215546Sopenharmony_ci            bar.vm = wait_imm::unset_counter;
436bf215546Sopenharmony_ci            bar_ev &= ~(vm_events & ~event_flat);
437bf215546Sopenharmony_ci         }
438bf215546Sopenharmony_ci         if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) {
439bf215546Sopenharmony_ci            bar.lgkm = wait_imm::unset_counter;
440bf215546Sopenharmony_ci            bar_ev &= ~(lgkm_events & ~event_flat);
441bf215546Sopenharmony_ci         }
442bf215546Sopenharmony_ci         if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) {
443bf215546Sopenharmony_ci            bar.vs = wait_imm::unset_counter;
444bf215546Sopenharmony_ci            bar_ev &= ~vs_events;
445bf215546Sopenharmony_ci         }
446bf215546Sopenharmony_ci         if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter)
447bf215546Sopenharmony_ci            bar_ev &= ~event_flat;
448bf215546Sopenharmony_ci      }
449bf215546Sopenharmony_ci
450bf215546Sopenharmony_ci      /* remove all gprs with higher counter from map */
451bf215546Sopenharmony_ci      std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
452bf215546Sopenharmony_ci      while (it != ctx.gpr_map.end()) {
453bf215546Sopenharmony_ci         if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
454bf215546Sopenharmony_ci            ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
455bf215546Sopenharmony_ci         if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
456bf215546Sopenharmony_ci            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm);
457bf215546Sopenharmony_ci         if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
458bf215546Sopenharmony_ci            ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
459bf215546Sopenharmony_ci         if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
460bf215546Sopenharmony_ci            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
461bf215546Sopenharmony_ci         if (!it->second.counters)
462bf215546Sopenharmony_ci            it = ctx.gpr_map.erase(it);
463bf215546Sopenharmony_ci         else
464bf215546Sopenharmony_ci            it++;
465bf215546Sopenharmony_ci      }
466bf215546Sopenharmony_ci   }
467bf215546Sopenharmony_ci
468bf215546Sopenharmony_ci   if (imm.vm == 0)
469bf215546Sopenharmony_ci      ctx.pending_flat_vm = false;
470bf215546Sopenharmony_ci   if (imm.lgkm == 0) {
471bf215546Sopenharmony_ci      ctx.pending_flat_lgkm = false;
472bf215546Sopenharmony_ci      ctx.pending_s_buffer_store = false;
473bf215546Sopenharmony_ci   }
474bf215546Sopenharmony_ci}
475bf215546Sopenharmony_ci
476bf215546Sopenharmony_civoid
477bf215546Sopenharmony_ciupdate_barrier_counter(uint8_t* ctr, unsigned max)
478bf215546Sopenharmony_ci{
479bf215546Sopenharmony_ci   if (*ctr != wait_imm::unset_counter && *ctr < max)
480bf215546Sopenharmony_ci      (*ctr)++;
481bf215546Sopenharmony_ci}
482bf215546Sopenharmony_ci
483bf215546Sopenharmony_civoid
484bf215546Sopenharmony_ciupdate_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
485bf215546Sopenharmony_ci{
486bf215546Sopenharmony_ci   for (unsigned i = 0; i < storage_count; i++) {
487bf215546Sopenharmony_ci      wait_imm& bar = ctx.barrier_imm[i];
488bf215546Sopenharmony_ci      uint16_t& bar_ev = ctx.barrier_events[i];
489bf215546Sopenharmony_ci      if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
490bf215546Sopenharmony_ci         bar_ev |= event;
491bf215546Sopenharmony_ci         if (counters & counter_lgkm)
492bf215546Sopenharmony_ci            bar.lgkm = 0;
493bf215546Sopenharmony_ci         if (counters & counter_vm)
494bf215546Sopenharmony_ci            bar.vm = 0;
495bf215546Sopenharmony_ci         if (counters & counter_exp)
496bf215546Sopenharmony_ci            bar.exp = 0;
497bf215546Sopenharmony_ci         if (counters & counter_vs)
498bf215546Sopenharmony_ci            bar.vs = 0;
499bf215546Sopenharmony_ci      } else if (!(bar_ev & ctx.unordered_events) && !(ctx.unordered_events & event)) {
500bf215546Sopenharmony_ci         if (counters & counter_lgkm && (bar_ev & lgkm_events) == event)
501bf215546Sopenharmony_ci            update_barrier_counter(&bar.lgkm, ctx.max_lgkm_cnt);
502bf215546Sopenharmony_ci         if (counters & counter_vm && (bar_ev & vm_events) == event)
503bf215546Sopenharmony_ci            update_barrier_counter(&bar.vm, ctx.max_vm_cnt);
504bf215546Sopenharmony_ci         if (counters & counter_exp && (bar_ev & exp_events) == event)
505bf215546Sopenharmony_ci            update_barrier_counter(&bar.exp, ctx.max_exp_cnt);
506bf215546Sopenharmony_ci         if (counters & counter_vs && (bar_ev & vs_events) == event)
507bf215546Sopenharmony_ci            update_barrier_counter(&bar.vs, ctx.max_vs_cnt);
508bf215546Sopenharmony_ci      }
509bf215546Sopenharmony_ci   }
510bf215546Sopenharmony_ci}
511bf215546Sopenharmony_ci
512bf215546Sopenharmony_civoid
513bf215546Sopenharmony_ciupdate_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
514bf215546Sopenharmony_ci{
515bf215546Sopenharmony_ci   uint8_t counters = get_counters_for_event(event);
516bf215546Sopenharmony_ci
517bf215546Sopenharmony_ci   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
518bf215546Sopenharmony_ci      ctx.lgkm_cnt++;
519bf215546Sopenharmony_ci   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
520bf215546Sopenharmony_ci      ctx.vm_cnt++;
521bf215546Sopenharmony_ci   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
522bf215546Sopenharmony_ci      ctx.exp_cnt++;
523bf215546Sopenharmony_ci   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
524bf215546Sopenharmony_ci      ctx.vs_cnt++;
525bf215546Sopenharmony_ci
526bf215546Sopenharmony_ci   update_barrier_imm(ctx, counters, event, sync);
527bf215546Sopenharmony_ci
528bf215546Sopenharmony_ci   if (ctx.unordered_events & event)
529bf215546Sopenharmony_ci      return;
530bf215546Sopenharmony_ci
531bf215546Sopenharmony_ci   if (ctx.pending_flat_lgkm)
532bf215546Sopenharmony_ci      counters &= ~counter_lgkm;
533bf215546Sopenharmony_ci   if (ctx.pending_flat_vm)
534bf215546Sopenharmony_ci      counters &= ~counter_vm;
535bf215546Sopenharmony_ci
536bf215546Sopenharmony_ci   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
537bf215546Sopenharmony_ci      wait_entry& entry = e.second;
538bf215546Sopenharmony_ci
539bf215546Sopenharmony_ci      if (entry.events & ctx.unordered_events)
540bf215546Sopenharmony_ci         continue;
541bf215546Sopenharmony_ci
542bf215546Sopenharmony_ci      assert(entry.events);
543bf215546Sopenharmony_ci
544bf215546Sopenharmony_ci      if ((counters & counter_exp) && (entry.events & exp_events) == event &&
545bf215546Sopenharmony_ci          entry.imm.exp < ctx.max_exp_cnt)
546bf215546Sopenharmony_ci         entry.imm.exp++;
547bf215546Sopenharmony_ci      if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event &&
548bf215546Sopenharmony_ci          entry.imm.lgkm < ctx.max_lgkm_cnt)
549bf215546Sopenharmony_ci         entry.imm.lgkm++;
550bf215546Sopenharmony_ci      if ((counters & counter_vm) && (entry.events & vm_events) == event &&
551bf215546Sopenharmony_ci          entry.imm.vm < ctx.max_vm_cnt)
552bf215546Sopenharmony_ci         entry.imm.vm++;
553bf215546Sopenharmony_ci      if ((counters & counter_vs) && (entry.events & vs_events) == event &&
554bf215546Sopenharmony_ci          entry.imm.vs < ctx.max_vs_cnt)
555bf215546Sopenharmony_ci         entry.imm.vs++;
556bf215546Sopenharmony_ci   }
557bf215546Sopenharmony_ci}
558bf215546Sopenharmony_ci
559bf215546Sopenharmony_civoid
560bf215546Sopenharmony_ciupdate_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
561bf215546Sopenharmony_ci{
562bf215546Sopenharmony_ci   assert(ctx.gfx_level < GFX10);
563bf215546Sopenharmony_ci
564bf215546Sopenharmony_ci   if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
565bf215546Sopenharmony_ci      ctx.lgkm_cnt++;
566bf215546Sopenharmony_ci   if (ctx.vm_cnt <= ctx.max_vm_cnt)
567bf215546Sopenharmony_ci      ctx.vm_cnt++;
568bf215546Sopenharmony_ci
569bf215546Sopenharmony_ci   update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
570bf215546Sopenharmony_ci
571bf215546Sopenharmony_ci   for (std::pair<PhysReg, wait_entry> e : ctx.gpr_map) {
572bf215546Sopenharmony_ci      if (e.second.counters & counter_vm)
573bf215546Sopenharmony_ci         e.second.imm.vm = 0;
574bf215546Sopenharmony_ci      if (e.second.counters & counter_lgkm)
575bf215546Sopenharmony_ci         e.second.imm.lgkm = 0;
576bf215546Sopenharmony_ci   }
577bf215546Sopenharmony_ci   ctx.pending_flat_lgkm = true;
578bf215546Sopenharmony_ci   ctx.pending_flat_vm = true;
579bf215546Sopenharmony_ci}
580bf215546Sopenharmony_ci
581bf215546Sopenharmony_civoid
582bf215546Sopenharmony_ciinsert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
583bf215546Sopenharmony_ci                  uint8_t vmem_types = 0)
584bf215546Sopenharmony_ci{
585bf215546Sopenharmony_ci   uint16_t counters = get_counters_for_event(event);
586bf215546Sopenharmony_ci   wait_imm imm;
587bf215546Sopenharmony_ci   if (counters & counter_lgkm)
588bf215546Sopenharmony_ci      imm.lgkm = 0;
589bf215546Sopenharmony_ci   if (counters & counter_vm)
590bf215546Sopenharmony_ci      imm.vm = 0;
591bf215546Sopenharmony_ci   if (counters & counter_exp)
592bf215546Sopenharmony_ci      imm.exp = 0;
593bf215546Sopenharmony_ci   if (counters & counter_vs)
594bf215546Sopenharmony_ci      imm.vs = 0;
595bf215546Sopenharmony_ci
596bf215546Sopenharmony_ci   wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);
597bf215546Sopenharmony_ci   new_entry.vmem_types |= vmem_types;
598bf215546Sopenharmony_ci
599bf215546Sopenharmony_ci   for (unsigned i = 0; i < rc.size(); i++) {
600bf215546Sopenharmony_ci      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
601bf215546Sopenharmony_ci      if (!it.second)
602bf215546Sopenharmony_ci         it.first->second.join(new_entry);
603bf215546Sopenharmony_ci   }
604bf215546Sopenharmony_ci}
605bf215546Sopenharmony_ci
606bf215546Sopenharmony_civoid
607bf215546Sopenharmony_ciinsert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0)
608bf215546Sopenharmony_ci{
609bf215546Sopenharmony_ci   if (!op.isConstant() && !op.isUndefined())
610bf215546Sopenharmony_ci      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types);
611bf215546Sopenharmony_ci}
612bf215546Sopenharmony_ci
613bf215546Sopenharmony_civoid
614bf215546Sopenharmony_ciinsert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0)
615bf215546Sopenharmony_ci{
616bf215546Sopenharmony_ci   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types);
617bf215546Sopenharmony_ci}
618bf215546Sopenharmony_ci
619bf215546Sopenharmony_civoid
620bf215546Sopenharmony_cigen(Instruction* instr, wait_ctx& ctx)
621bf215546Sopenharmony_ci{
622bf215546Sopenharmony_ci   switch (instr->format) {
623bf215546Sopenharmony_ci   case Format::EXP: {
624bf215546Sopenharmony_ci      Export_instruction& exp_instr = instr->exp();
625bf215546Sopenharmony_ci
626bf215546Sopenharmony_ci      wait_event ev;
627bf215546Sopenharmony_ci      if (exp_instr.dest <= 9)
628bf215546Sopenharmony_ci         ev = event_exp_mrt_null;
629bf215546Sopenharmony_ci      else if (exp_instr.dest <= 15)
630bf215546Sopenharmony_ci         ev = event_exp_pos;
631bf215546Sopenharmony_ci      else
632bf215546Sopenharmony_ci         ev = event_exp_param;
633bf215546Sopenharmony_ci      update_counters(ctx, ev);
634bf215546Sopenharmony_ci
635bf215546Sopenharmony_ci      /* insert new entries for exported vgprs */
636bf215546Sopenharmony_ci      for (unsigned i = 0; i < 4; i++) {
637bf215546Sopenharmony_ci         if (exp_instr.enabled_mask & (1 << i)) {
638bf215546Sopenharmony_ci            unsigned idx = exp_instr.compressed ? i >> 1 : i;
639bf215546Sopenharmony_ci            assert(idx < exp_instr.operands.size());
640bf215546Sopenharmony_ci            insert_wait_entry(ctx, exp_instr.operands[idx], ev);
641bf215546Sopenharmony_ci         }
642bf215546Sopenharmony_ci      }
643bf215546Sopenharmony_ci      insert_wait_entry(ctx, exec, s2, ev, false);
644bf215546Sopenharmony_ci      break;
645bf215546Sopenharmony_ci   }
646bf215546Sopenharmony_ci   case Format::FLAT: {
647bf215546Sopenharmony_ci      FLAT_instruction& flat = instr->flat();
648bf215546Sopenharmony_ci      if (ctx.gfx_level < GFX10 && !instr->definitions.empty())
649bf215546Sopenharmony_ci         update_counters_for_flat_load(ctx, flat.sync);
650bf215546Sopenharmony_ci      else
651bf215546Sopenharmony_ci         update_counters(ctx, event_flat, flat.sync);
652bf215546Sopenharmony_ci
653bf215546Sopenharmony_ci      if (!instr->definitions.empty())
654bf215546Sopenharmony_ci         insert_wait_entry(ctx, instr->definitions[0], event_flat);
655bf215546Sopenharmony_ci      break;
656bf215546Sopenharmony_ci   }
657bf215546Sopenharmony_ci   case Format::SMEM: {
658bf215546Sopenharmony_ci      SMEM_instruction& smem = instr->smem();
659bf215546Sopenharmony_ci      update_counters(ctx, event_smem, smem.sync);
660bf215546Sopenharmony_ci
661bf215546Sopenharmony_ci      if (!instr->definitions.empty())
662bf215546Sopenharmony_ci         insert_wait_entry(ctx, instr->definitions[0], event_smem);
663bf215546Sopenharmony_ci      else if (ctx.gfx_level >= GFX10 && !smem.sync.can_reorder())
664bf215546Sopenharmony_ci         ctx.pending_s_buffer_store = true;
665bf215546Sopenharmony_ci
666bf215546Sopenharmony_ci      break;
667bf215546Sopenharmony_ci   }
668bf215546Sopenharmony_ci   case Format::DS: {
669bf215546Sopenharmony_ci      DS_instruction& ds = instr->ds();
670bf215546Sopenharmony_ci      update_counters(ctx, ds.gds ? event_gds : event_lds, ds.sync);
671bf215546Sopenharmony_ci      if (ds.gds)
672bf215546Sopenharmony_ci         update_counters(ctx, event_gds_gpr_lock);
673bf215546Sopenharmony_ci
674bf215546Sopenharmony_ci      if (!instr->definitions.empty())
675bf215546Sopenharmony_ci         insert_wait_entry(ctx, instr->definitions[0], ds.gds ? event_gds : event_lds);
676bf215546Sopenharmony_ci
677bf215546Sopenharmony_ci      if (ds.gds) {
678bf215546Sopenharmony_ci         for (const Operand& op : instr->operands)
679bf215546Sopenharmony_ci            insert_wait_entry(ctx, op, event_gds_gpr_lock);
680bf215546Sopenharmony_ci         insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
681bf215546Sopenharmony_ci      }
682bf215546Sopenharmony_ci      break;
683bf215546Sopenharmony_ci   }
684bf215546Sopenharmony_ci   case Format::MUBUF:
685bf215546Sopenharmony_ci   case Format::MTBUF:
686bf215546Sopenharmony_ci   case Format::MIMG:
687bf215546Sopenharmony_ci   case Format::GLOBAL:
688bf215546Sopenharmony_ci   case Format::SCRATCH: {
689bf215546Sopenharmony_ci      wait_event ev =
690bf215546Sopenharmony_ci         !instr->definitions.empty() || ctx.gfx_level < GFX10 ? event_vmem : event_vmem_store;
691bf215546Sopenharmony_ci      update_counters(ctx, ev, get_sync_info(instr));
692bf215546Sopenharmony_ci
693bf215546Sopenharmony_ci      if (!instr->definitions.empty())
694bf215546Sopenharmony_ci         insert_wait_entry(ctx, instr->definitions[0], ev, get_vmem_type(instr));
695bf215546Sopenharmony_ci
696bf215546Sopenharmony_ci      if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
697bf215546Sopenharmony_ci         update_counters(ctx, event_vmem_gpr_lock);
698bf215546Sopenharmony_ci         insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
699bf215546Sopenharmony_ci      } else if (ctx.gfx_level == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
700bf215546Sopenharmony_ci         update_counters(ctx, event_vmem_gpr_lock);
701bf215546Sopenharmony_ci         insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
702bf215546Sopenharmony_ci      }
703bf215546Sopenharmony_ci
704bf215546Sopenharmony_ci      break;
705bf215546Sopenharmony_ci   }
706bf215546Sopenharmony_ci   case Format::SOPP: {
707bf215546Sopenharmony_ci      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt)
708bf215546Sopenharmony_ci         update_counters(ctx, event_sendmsg);
709bf215546Sopenharmony_ci      break;
710bf215546Sopenharmony_ci   }
711bf215546Sopenharmony_ci   default: break;
712bf215546Sopenharmony_ci   }
713bf215546Sopenharmony_ci}
714bf215546Sopenharmony_ci
715bf215546Sopenharmony_civoid
716bf215546Sopenharmony_ciemit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
717bf215546Sopenharmony_ci{
718bf215546Sopenharmony_ci   if (imm.vs != wait_imm::unset_counter) {
719bf215546Sopenharmony_ci      assert(ctx.gfx_level >= GFX10);
720bf215546Sopenharmony_ci      SOPK_instruction* waitcnt_vs =
721bf215546Sopenharmony_ci         create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
722bf215546Sopenharmony_ci      waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
723bf215546Sopenharmony_ci      waitcnt_vs->imm = imm.vs;
724bf215546Sopenharmony_ci      instructions.emplace_back(waitcnt_vs);
725bf215546Sopenharmony_ci      imm.vs = wait_imm::unset_counter;
726bf215546Sopenharmony_ci   }
727bf215546Sopenharmony_ci   if (!imm.empty()) {
728bf215546Sopenharmony_ci      SOPP_instruction* waitcnt =
729bf215546Sopenharmony_ci         create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
730bf215546Sopenharmony_ci      waitcnt->imm = imm.pack(ctx.gfx_level);
731bf215546Sopenharmony_ci      waitcnt->block = -1;
732bf215546Sopenharmony_ci      instructions.emplace_back(waitcnt);
733bf215546Sopenharmony_ci   }
734bf215546Sopenharmony_ci   imm = wait_imm();
735bf215546Sopenharmony_ci}
736bf215546Sopenharmony_ci
737bf215546Sopenharmony_civoid
738bf215546Sopenharmony_cihandle_block(Program* program, Block& block, wait_ctx& ctx)
739bf215546Sopenharmony_ci{
740bf215546Sopenharmony_ci   std::vector<aco_ptr<Instruction>> new_instructions;
741bf215546Sopenharmony_ci
742bf215546Sopenharmony_ci   wait_imm queued_imm;
743bf215546Sopenharmony_ci
744bf215546Sopenharmony_ci   for (aco_ptr<Instruction>& instr : block.instructions) {
745bf215546Sopenharmony_ci      bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());
746bf215546Sopenharmony_ci
747bf215546Sopenharmony_ci      memory_sync_info sync_info = get_sync_info(instr.get());
748bf215546Sopenharmony_ci      kill(queued_imm, instr.get(), ctx, sync_info);
749bf215546Sopenharmony_ci
750bf215546Sopenharmony_ci      gen(instr.get(), ctx);
751bf215546Sopenharmony_ci
752bf215546Sopenharmony_ci      if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
753bf215546Sopenharmony_ci         if (!queued_imm.empty())
754bf215546Sopenharmony_ci            emit_waitcnt(ctx, new_instructions, queued_imm);
755bf215546Sopenharmony_ci
756bf215546Sopenharmony_ci         new_instructions.emplace_back(std::move(instr));
757bf215546Sopenharmony_ci         perform_barrier(ctx, queued_imm, sync_info, semantic_acquire);
758bf215546Sopenharmony_ci      }
759bf215546Sopenharmony_ci   }
760bf215546Sopenharmony_ci
761bf215546Sopenharmony_ci   if (!queued_imm.empty())
762bf215546Sopenharmony_ci      emit_waitcnt(ctx, new_instructions, queued_imm);
763bf215546Sopenharmony_ci
764bf215546Sopenharmony_ci   block.instructions.swap(new_instructions);
765bf215546Sopenharmony_ci}
766bf215546Sopenharmony_ci
767bf215546Sopenharmony_ci} /* end namespace */
768bf215546Sopenharmony_ci
769bf215546Sopenharmony_civoid
770bf215546Sopenharmony_ciinsert_wait_states(Program* program)
771bf215546Sopenharmony_ci{
772bf215546Sopenharmony_ci   /* per BB ctx */
773bf215546Sopenharmony_ci   std::vector<bool> done(program->blocks.size());
774bf215546Sopenharmony_ci   std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program));
775bf215546Sopenharmony_ci   std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program));
776bf215546Sopenharmony_ci
777bf215546Sopenharmony_ci   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
778bf215546Sopenharmony_ci   unsigned loop_progress = 0;
779bf215546Sopenharmony_ci
780bf215546Sopenharmony_ci   if (program->stage.has(SWStage::VS) && program->info.vs.dynamic_inputs) {
781bf215546Sopenharmony_ci      for (Definition def : program->vs_inputs) {
782bf215546Sopenharmony_ci         update_counters(in_ctx[0], event_vmem);
783bf215546Sopenharmony_ci         insert_wait_entry(in_ctx[0], def, event_vmem);
784bf215546Sopenharmony_ci      }
785bf215546Sopenharmony_ci   }
786bf215546Sopenharmony_ci
787bf215546Sopenharmony_ci   for (unsigned i = 0; i < program->blocks.size();) {
788bf215546Sopenharmony_ci      Block& current = program->blocks[i++];
789bf215546Sopenharmony_ci      wait_ctx ctx = in_ctx[current.index];
790bf215546Sopenharmony_ci
791bf215546Sopenharmony_ci      if (current.kind & block_kind_loop_header) {
792bf215546Sopenharmony_ci         loop_header_indices.push(current.index);
793bf215546Sopenharmony_ci      } else if (current.kind & block_kind_loop_exit) {
794bf215546Sopenharmony_ci         bool repeat = false;
795bf215546Sopenharmony_ci         if (loop_progress == loop_header_indices.size()) {
796bf215546Sopenharmony_ci            i = loop_header_indices.top();
797bf215546Sopenharmony_ci            repeat = true;
798bf215546Sopenharmony_ci         }
799bf215546Sopenharmony_ci         loop_header_indices.pop();
800bf215546Sopenharmony_ci         loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
801bf215546Sopenharmony_ci         if (repeat)
802bf215546Sopenharmony_ci            continue;
803bf215546Sopenharmony_ci      }
804bf215546Sopenharmony_ci
805bf215546Sopenharmony_ci      bool changed = false;
806bf215546Sopenharmony_ci      for (unsigned b : current.linear_preds)
807bf215546Sopenharmony_ci         changed |= ctx.join(&out_ctx[b], false);
808bf215546Sopenharmony_ci      for (unsigned b : current.logical_preds)
809bf215546Sopenharmony_ci         changed |= ctx.join(&out_ctx[b], true);
810bf215546Sopenharmony_ci
811bf215546Sopenharmony_ci      if (done[current.index] && !changed) {
812bf215546Sopenharmony_ci         in_ctx[current.index] = std::move(ctx);
813bf215546Sopenharmony_ci         continue;
814bf215546Sopenharmony_ci      } else {
815bf215546Sopenharmony_ci         in_ctx[current.index] = ctx;
816bf215546Sopenharmony_ci      }
817bf215546Sopenharmony_ci
818bf215546Sopenharmony_ci      if (current.instructions.empty()) {
819bf215546Sopenharmony_ci         out_ctx[current.index] = std::move(ctx);
820bf215546Sopenharmony_ci         continue;
821bf215546Sopenharmony_ci      }
822bf215546Sopenharmony_ci
823bf215546Sopenharmony_ci      loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
824bf215546Sopenharmony_ci      done[current.index] = true;
825bf215546Sopenharmony_ci
826bf215546Sopenharmony_ci      handle_block(program, current, ctx);
827bf215546Sopenharmony_ci
828bf215546Sopenharmony_ci      out_ctx[current.index] = std::move(ctx);
829bf215546Sopenharmony_ci   }
830bf215546Sopenharmony_ci}
831bf215546Sopenharmony_ci
832bf215546Sopenharmony_ci} // namespace aco
833