1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2018 Valve Corporation
3bf215546Sopenharmony_ci * Copyright © 2018 Google
4bf215546Sopenharmony_ci *
5bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
6bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
7bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
8bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
10bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
11bf215546Sopenharmony_ci *
12bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
13bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
14bf215546Sopenharmony_ci * Software.
15bf215546Sopenharmony_ci *
16bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22bf215546Sopenharmony_ci * IN THE SOFTWARE.
23bf215546Sopenharmony_ci *
24bf215546Sopenharmony_ci */
25bf215546Sopenharmony_ci
26bf215546Sopenharmony_ci#include "aco_ir.h"
27bf215546Sopenharmony_ci
28bf215546Sopenharmony_ci#include "util/u_math.h"
29bf215546Sopenharmony_ci
30bf215546Sopenharmony_ci#include <set>
31bf215546Sopenharmony_ci#include <vector>
32bf215546Sopenharmony_ci
33bf215546Sopenharmony_cinamespace aco {
34bf215546Sopenharmony_ciRegisterDemand
35bf215546Sopenharmony_ciget_live_changes(aco_ptr<Instruction>& instr)
36bf215546Sopenharmony_ci{
37bf215546Sopenharmony_ci   RegisterDemand changes;
38bf215546Sopenharmony_ci   for (const Definition& def : instr->definitions) {
39bf215546Sopenharmony_ci      if (!def.isTemp() || def.isKill())
40bf215546Sopenharmony_ci         continue;
41bf215546Sopenharmony_ci      changes += def.getTemp();
42bf215546Sopenharmony_ci   }
43bf215546Sopenharmony_ci
44bf215546Sopenharmony_ci   for (const Operand& op : instr->operands) {
45bf215546Sopenharmony_ci      if (!op.isTemp() || !op.isFirstKill())
46bf215546Sopenharmony_ci         continue;
47bf215546Sopenharmony_ci      changes -= op.getTemp();
48bf215546Sopenharmony_ci   }
49bf215546Sopenharmony_ci
50bf215546Sopenharmony_ci   return changes;
51bf215546Sopenharmony_ci}
52bf215546Sopenharmony_ci
53bf215546Sopenharmony_ciRegisterDemand
54bf215546Sopenharmony_ciget_temp_registers(aco_ptr<Instruction>& instr)
55bf215546Sopenharmony_ci{
56bf215546Sopenharmony_ci   RegisterDemand temp_registers;
57bf215546Sopenharmony_ci
58bf215546Sopenharmony_ci   for (Definition def : instr->definitions) {
59bf215546Sopenharmony_ci      if (!def.isTemp())
60bf215546Sopenharmony_ci         continue;
61bf215546Sopenharmony_ci      if (def.isKill())
62bf215546Sopenharmony_ci         temp_registers += def.getTemp();
63bf215546Sopenharmony_ci   }
64bf215546Sopenharmony_ci
65bf215546Sopenharmony_ci   for (Operand op : instr->operands) {
66bf215546Sopenharmony_ci      if (op.isTemp() && op.isLateKill() && op.isFirstKill())
67bf215546Sopenharmony_ci         temp_registers += op.getTemp();
68bf215546Sopenharmony_ci   }
69bf215546Sopenharmony_ci
70bf215546Sopenharmony_ci   return temp_registers;
71bf215546Sopenharmony_ci}
72bf215546Sopenharmony_ci
73bf215546Sopenharmony_ciRegisterDemand
74bf215546Sopenharmony_ciget_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
75bf215546Sopenharmony_ci                  aco_ptr<Instruction>& instr_before)
76bf215546Sopenharmony_ci{
77bf215546Sopenharmony_ci   demand -= get_live_changes(instr);
78bf215546Sopenharmony_ci   demand -= get_temp_registers(instr);
79bf215546Sopenharmony_ci   if (instr_before)
80bf215546Sopenharmony_ci      demand += get_temp_registers(instr_before);
81bf215546Sopenharmony_ci   return demand;
82bf215546Sopenharmony_ci}
83bf215546Sopenharmony_ci
84bf215546Sopenharmony_cinamespace {
85bf215546Sopenharmony_cistruct PhiInfo {
86bf215546Sopenharmony_ci   uint16_t logical_phi_sgpr_ops = 0;
87bf215546Sopenharmony_ci   uint16_t linear_phi_ops = 0;
88bf215546Sopenharmony_ci   uint16_t linear_phi_defs = 0;
89bf215546Sopenharmony_ci};
90bf215546Sopenharmony_ci
91bf215546Sopenharmony_cibool
92bf215546Sopenharmony_ciinstr_needs_vcc(Instruction* instr)
93bf215546Sopenharmony_ci{
94bf215546Sopenharmony_ci   if (instr->isVOPC())
95bf215546Sopenharmony_ci      return true;
96bf215546Sopenharmony_ci   if (instr->isVOP2() && !instr->isVOP3()) {
97bf215546Sopenharmony_ci      if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
98bf215546Sopenharmony_ci          instr->operands[2].regClass().type() == RegType::sgpr)
99bf215546Sopenharmony_ci         return true;
100bf215546Sopenharmony_ci      if (instr->definitions.size() == 2)
101bf215546Sopenharmony_ci         return true;
102bf215546Sopenharmony_ci   }
103bf215546Sopenharmony_ci   return false;
104bf215546Sopenharmony_ci}
105bf215546Sopenharmony_ci
106bf215546Sopenharmony_civoid
107bf215546Sopenharmony_ciprocess_live_temps_per_block(Program* program, live& lives, Block* block, unsigned& worklist,
108bf215546Sopenharmony_ci                             std::vector<PhiInfo>& phi_info)
109bf215546Sopenharmony_ci{
110bf215546Sopenharmony_ci   std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
111bf215546Sopenharmony_ci   RegisterDemand new_demand;
112bf215546Sopenharmony_ci
113bf215546Sopenharmony_ci   register_demand.resize(block->instructions.size());
114bf215546Sopenharmony_ci   RegisterDemand block_register_demand;
115bf215546Sopenharmony_ci   IDSet live = lives.live_out[block->index];
116bf215546Sopenharmony_ci
117bf215546Sopenharmony_ci   /* initialize register demand */
118bf215546Sopenharmony_ci   for (unsigned t : live)
119bf215546Sopenharmony_ci      new_demand += Temp(t, program->temp_rc[t]);
120bf215546Sopenharmony_ci   new_demand.sgpr -= phi_info[block->index].logical_phi_sgpr_ops;
121bf215546Sopenharmony_ci
122bf215546Sopenharmony_ci   /* traverse the instructions backwards */
123bf215546Sopenharmony_ci   int idx;
124bf215546Sopenharmony_ci   for (idx = block->instructions.size() - 1; idx >= 0; idx--) {
125bf215546Sopenharmony_ci      Instruction* insn = block->instructions[idx].get();
126bf215546Sopenharmony_ci      if (is_phi(insn))
127bf215546Sopenharmony_ci         break;
128bf215546Sopenharmony_ci
129bf215546Sopenharmony_ci      program->needs_vcc |= instr_needs_vcc(insn);
130bf215546Sopenharmony_ci      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr);
131bf215546Sopenharmony_ci
132bf215546Sopenharmony_ci      /* KILL */
133bf215546Sopenharmony_ci      for (Definition& definition : insn->definitions) {
134bf215546Sopenharmony_ci         if (!definition.isTemp()) {
135bf215546Sopenharmony_ci            continue;
136bf215546Sopenharmony_ci         }
137bf215546Sopenharmony_ci         if (definition.isFixed() && definition.physReg() == vcc)
138bf215546Sopenharmony_ci            program->needs_vcc = true;
139bf215546Sopenharmony_ci
140bf215546Sopenharmony_ci         const Temp temp = definition.getTemp();
141bf215546Sopenharmony_ci         const size_t n = live.erase(temp.id());
142bf215546Sopenharmony_ci
143bf215546Sopenharmony_ci         if (n) {
144bf215546Sopenharmony_ci            new_demand -= temp;
145bf215546Sopenharmony_ci            definition.setKill(false);
146bf215546Sopenharmony_ci         } else {
147bf215546Sopenharmony_ci            register_demand[idx] += temp;
148bf215546Sopenharmony_ci            definition.setKill(true);
149bf215546Sopenharmony_ci         }
150bf215546Sopenharmony_ci      }
151bf215546Sopenharmony_ci
152bf215546Sopenharmony_ci      /* GEN */
153bf215546Sopenharmony_ci      if (insn->opcode == aco_opcode::p_logical_end) {
154bf215546Sopenharmony_ci         new_demand.sgpr += phi_info[block->index].logical_phi_sgpr_ops;
155bf215546Sopenharmony_ci      } else {
156bf215546Sopenharmony_ci         /* we need to do this in a separate loop because the next one can
157bf215546Sopenharmony_ci          * setKill() for several operands at once and we don't want to
158bf215546Sopenharmony_ci          * overwrite that in a later iteration */
159bf215546Sopenharmony_ci         for (Operand& op : insn->operands)
160bf215546Sopenharmony_ci            op.setKill(false);
161bf215546Sopenharmony_ci
162bf215546Sopenharmony_ci         for (unsigned i = 0; i < insn->operands.size(); ++i) {
163bf215546Sopenharmony_ci            Operand& operand = insn->operands[i];
164bf215546Sopenharmony_ci            if (!operand.isTemp())
165bf215546Sopenharmony_ci               continue;
166bf215546Sopenharmony_ci            if (operand.isFixed() && operand.physReg() == vcc)
167bf215546Sopenharmony_ci               program->needs_vcc = true;
168bf215546Sopenharmony_ci            const Temp temp = operand.getTemp();
169bf215546Sopenharmony_ci            const bool inserted = live.insert(temp.id()).second;
170bf215546Sopenharmony_ci            if (inserted) {
171bf215546Sopenharmony_ci               operand.setFirstKill(true);
172bf215546Sopenharmony_ci               for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
173bf215546Sopenharmony_ci                  if (insn->operands[j].isTemp() &&
174bf215546Sopenharmony_ci                      insn->operands[j].tempId() == operand.tempId()) {
175bf215546Sopenharmony_ci                     insn->operands[j].setFirstKill(false);
176bf215546Sopenharmony_ci                     insn->operands[j].setKill(true);
177bf215546Sopenharmony_ci                  }
178bf215546Sopenharmony_ci               }
179bf215546Sopenharmony_ci               if (operand.isLateKill())
180bf215546Sopenharmony_ci                  register_demand[idx] += temp;
181bf215546Sopenharmony_ci               new_demand += temp;
182bf215546Sopenharmony_ci            }
183bf215546Sopenharmony_ci         }
184bf215546Sopenharmony_ci      }
185bf215546Sopenharmony_ci
186bf215546Sopenharmony_ci      block_register_demand.update(register_demand[idx]);
187bf215546Sopenharmony_ci   }
188bf215546Sopenharmony_ci
189bf215546Sopenharmony_ci   /* update block's register demand for a last time */
190bf215546Sopenharmony_ci   block_register_demand.update(new_demand);
191bf215546Sopenharmony_ci   if (program->progress < CompilationProgress::after_ra)
192bf215546Sopenharmony_ci      block->register_demand = block_register_demand;
193bf215546Sopenharmony_ci
194bf215546Sopenharmony_ci   /* handle phi definitions */
195bf215546Sopenharmony_ci   uint16_t linear_phi_defs = 0;
196bf215546Sopenharmony_ci   int phi_idx = idx;
197bf215546Sopenharmony_ci   while (phi_idx >= 0) {
198bf215546Sopenharmony_ci      register_demand[phi_idx] = new_demand;
199bf215546Sopenharmony_ci      Instruction* insn = block->instructions[phi_idx].get();
200bf215546Sopenharmony_ci
201bf215546Sopenharmony_ci      assert(is_phi(insn) && insn->definitions.size() == 1);
202bf215546Sopenharmony_ci      if (!insn->definitions[0].isTemp()) {
203bf215546Sopenharmony_ci         assert(insn->definitions[0].isFixed() && insn->definitions[0].physReg() == exec);
204bf215546Sopenharmony_ci         phi_idx--;
205bf215546Sopenharmony_ci         continue;
206bf215546Sopenharmony_ci      }
207bf215546Sopenharmony_ci      Definition& definition = insn->definitions[0];
208bf215546Sopenharmony_ci      if (definition.isFixed() && definition.physReg() == vcc)
209bf215546Sopenharmony_ci         program->needs_vcc = true;
210bf215546Sopenharmony_ci      const Temp temp = definition.getTemp();
211bf215546Sopenharmony_ci      const size_t n = live.erase(temp.id());
212bf215546Sopenharmony_ci
213bf215546Sopenharmony_ci      if (n)
214bf215546Sopenharmony_ci         definition.setKill(false);
215bf215546Sopenharmony_ci      else
216bf215546Sopenharmony_ci         definition.setKill(true);
217bf215546Sopenharmony_ci
218bf215546Sopenharmony_ci      if (insn->opcode == aco_opcode::p_linear_phi) {
219bf215546Sopenharmony_ci         assert(definition.getTemp().type() == RegType::sgpr);
220bf215546Sopenharmony_ci         linear_phi_defs += definition.size();
221bf215546Sopenharmony_ci      }
222bf215546Sopenharmony_ci
223bf215546Sopenharmony_ci      phi_idx--;
224bf215546Sopenharmony_ci   }
225bf215546Sopenharmony_ci
226bf215546Sopenharmony_ci   for (unsigned pred_idx : block->linear_preds)
227bf215546Sopenharmony_ci      phi_info[pred_idx].linear_phi_defs = linear_phi_defs;
228bf215546Sopenharmony_ci
229bf215546Sopenharmony_ci   /* now, we need to merge the live-ins into the live-out sets */
230bf215546Sopenharmony_ci   for (unsigned t : live) {
231bf215546Sopenharmony_ci      RegClass rc = program->temp_rc[t];
232bf215546Sopenharmony_ci      std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
233bf215546Sopenharmony_ci
234bf215546Sopenharmony_ci#ifndef NDEBUG
235bf215546Sopenharmony_ci      if (preds.empty())
236bf215546Sopenharmony_ci         aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t,
237bf215546Sopenharmony_ci                 block->index);
238bf215546Sopenharmony_ci#endif
239bf215546Sopenharmony_ci
240bf215546Sopenharmony_ci      for (unsigned pred_idx : preds) {
241bf215546Sopenharmony_ci         auto it = lives.live_out[pred_idx].insert(t);
242bf215546Sopenharmony_ci         if (it.second)
243bf215546Sopenharmony_ci            worklist = std::max(worklist, pred_idx + 1);
244bf215546Sopenharmony_ci      }
245bf215546Sopenharmony_ci   }
246bf215546Sopenharmony_ci
247bf215546Sopenharmony_ci   /* handle phi operands */
248bf215546Sopenharmony_ci   phi_idx = idx;
249bf215546Sopenharmony_ci   while (phi_idx >= 0) {
250bf215546Sopenharmony_ci      Instruction* insn = block->instructions[phi_idx].get();
251bf215546Sopenharmony_ci      assert(is_phi(insn));
252bf215546Sopenharmony_ci      /* directly insert into the predecessors live-out set */
253bf215546Sopenharmony_ci      std::vector<unsigned>& preds =
254bf215546Sopenharmony_ci         insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
255bf215546Sopenharmony_ci      for (unsigned i = 0; i < preds.size(); ++i) {
256bf215546Sopenharmony_ci         Operand& operand = insn->operands[i];
257bf215546Sopenharmony_ci         if (!operand.isTemp())
258bf215546Sopenharmony_ci            continue;
259bf215546Sopenharmony_ci         if (operand.isFixed() && operand.physReg() == vcc)
260bf215546Sopenharmony_ci            program->needs_vcc = true;
261bf215546Sopenharmony_ci         /* check if we changed an already processed block */
262bf215546Sopenharmony_ci         const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;
263bf215546Sopenharmony_ci         if (inserted) {
264bf215546Sopenharmony_ci            worklist = std::max(worklist, preds[i] + 1);
265bf215546Sopenharmony_ci            if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr) {
266bf215546Sopenharmony_ci               phi_info[preds[i]].logical_phi_sgpr_ops += operand.size();
267bf215546Sopenharmony_ci            } else if (insn->opcode == aco_opcode::p_linear_phi) {
268bf215546Sopenharmony_ci               assert(operand.getTemp().type() == RegType::sgpr);
269bf215546Sopenharmony_ci               phi_info[preds[i]].linear_phi_ops += operand.size();
270bf215546Sopenharmony_ci            }
271bf215546Sopenharmony_ci         }
272bf215546Sopenharmony_ci
273bf215546Sopenharmony_ci         /* set if the operand is killed by this (or another) phi instruction */
274bf215546Sopenharmony_ci         operand.setKill(!live.count(operand.tempId()));
275bf215546Sopenharmony_ci      }
276bf215546Sopenharmony_ci      phi_idx--;
277bf215546Sopenharmony_ci   }
278bf215546Sopenharmony_ci
279bf215546Sopenharmony_ci   assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));
280bf215546Sopenharmony_ci}
281bf215546Sopenharmony_ci
282bf215546Sopenharmony_ciunsigned
283bf215546Sopenharmony_cicalc_waves_per_workgroup(Program* program)
284bf215546Sopenharmony_ci{
285bf215546Sopenharmony_ci   /* When workgroup size is not known, just go with wave_size */
286bf215546Sopenharmony_ci   unsigned workgroup_size =
287bf215546Sopenharmony_ci      program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size;
288bf215546Sopenharmony_ci
289bf215546Sopenharmony_ci   return align(workgroup_size, program->wave_size) / program->wave_size;
290bf215546Sopenharmony_ci}
291bf215546Sopenharmony_ci} /* end namespace */
292bf215546Sopenharmony_ci
293bf215546Sopenharmony_ciuint16_t
294bf215546Sopenharmony_ciget_extra_sgprs(Program* program)
295bf215546Sopenharmony_ci{
296bf215546Sopenharmony_ci   /* We don't use this register on GFX6-8 and it's removed on GFX10+. */
297bf215546Sopenharmony_ci   bool needs_flat_scr = program->config->scratch_bytes_per_wave && program->gfx_level == GFX9;
298bf215546Sopenharmony_ci
299bf215546Sopenharmony_ci   if (program->gfx_level >= GFX10) {
300bf215546Sopenharmony_ci      assert(!program->dev.xnack_enabled);
301bf215546Sopenharmony_ci      return 0;
302bf215546Sopenharmony_ci   } else if (program->gfx_level >= GFX8) {
303bf215546Sopenharmony_ci      if (needs_flat_scr)
304bf215546Sopenharmony_ci         return 6;
305bf215546Sopenharmony_ci      else if (program->dev.xnack_enabled)
306bf215546Sopenharmony_ci         return 4;
307bf215546Sopenharmony_ci      else if (program->needs_vcc)
308bf215546Sopenharmony_ci         return 2;
309bf215546Sopenharmony_ci      else
310bf215546Sopenharmony_ci         return 0;
311bf215546Sopenharmony_ci   } else {
312bf215546Sopenharmony_ci      assert(!program->dev.xnack_enabled);
313bf215546Sopenharmony_ci      if (needs_flat_scr)
314bf215546Sopenharmony_ci         return 4;
315bf215546Sopenharmony_ci      else if (program->needs_vcc)
316bf215546Sopenharmony_ci         return 2;
317bf215546Sopenharmony_ci      else
318bf215546Sopenharmony_ci         return 0;
319bf215546Sopenharmony_ci   }
320bf215546Sopenharmony_ci}
321bf215546Sopenharmony_ci
322bf215546Sopenharmony_ciuint16_t
323bf215546Sopenharmony_ciget_sgpr_alloc(Program* program, uint16_t addressable_sgprs)
324bf215546Sopenharmony_ci{
325bf215546Sopenharmony_ci   uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
326bf215546Sopenharmony_ci   uint16_t granule = program->dev.sgpr_alloc_granule;
327bf215546Sopenharmony_ci   return ALIGN_NPOT(std::max(sgprs, granule), granule);
328bf215546Sopenharmony_ci}
329bf215546Sopenharmony_ci
330bf215546Sopenharmony_ciuint16_t
331bf215546Sopenharmony_ciget_vgpr_alloc(Program* program, uint16_t addressable_vgprs)
332bf215546Sopenharmony_ci{
333bf215546Sopenharmony_ci   assert(addressable_vgprs <= program->dev.vgpr_limit);
334bf215546Sopenharmony_ci   uint16_t granule = program->dev.vgpr_alloc_granule;
335bf215546Sopenharmony_ci   return align(std::max(addressable_vgprs, granule), granule);
336bf215546Sopenharmony_ci}
337bf215546Sopenharmony_ci
338bf215546Sopenharmony_ciunsigned
339bf215546Sopenharmony_ciround_down(unsigned a, unsigned b)
340bf215546Sopenharmony_ci{
341bf215546Sopenharmony_ci   return a - (a % b);
342bf215546Sopenharmony_ci}
343bf215546Sopenharmony_ci
344bf215546Sopenharmony_ciuint16_t
345bf215546Sopenharmony_ciget_addr_sgpr_from_waves(Program* program, uint16_t waves)
346bf215546Sopenharmony_ci{
347bf215546Sopenharmony_ci   /* it's not possible to allocate more than 128 SGPRs */
348bf215546Sopenharmony_ci   uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);
349bf215546Sopenharmony_ci   sgprs = round_down(sgprs, program->dev.sgpr_alloc_granule);
350bf215546Sopenharmony_ci   sgprs -= get_extra_sgprs(program);
351bf215546Sopenharmony_ci   return std::min(sgprs, program->dev.sgpr_limit);
352bf215546Sopenharmony_ci}
353bf215546Sopenharmony_ci
354bf215546Sopenharmony_ciuint16_t
355bf215546Sopenharmony_ciget_addr_vgpr_from_waves(Program* program, uint16_t waves)
356bf215546Sopenharmony_ci{
357bf215546Sopenharmony_ci   uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1);
358bf215546Sopenharmony_ci   vgprs -= program->config->num_shared_vgprs / 2;
359bf215546Sopenharmony_ci   return std::min(vgprs, program->dev.vgpr_limit);
360bf215546Sopenharmony_ci}
361bf215546Sopenharmony_ci
362bf215546Sopenharmony_civoid
363bf215546Sopenharmony_cicalc_min_waves(Program* program)
364bf215546Sopenharmony_ci{
365bf215546Sopenharmony_ci   unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
366bf215546Sopenharmony_ci   unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
367bf215546Sopenharmony_ci   program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
368bf215546Sopenharmony_ci}
369bf215546Sopenharmony_ci
370bf215546Sopenharmony_ciuint16_t
371bf215546Sopenharmony_cimax_suitable_waves(Program* program, uint16_t waves)
372bf215546Sopenharmony_ci{
373bf215546Sopenharmony_ci   unsigned num_simd = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
374bf215546Sopenharmony_ci   unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
375bf215546Sopenharmony_ci   unsigned num_workgroups = waves * num_simd / waves_per_workgroup;
376bf215546Sopenharmony_ci
377bf215546Sopenharmony_ci   /* Adjust #workgroups for LDS */
378bf215546Sopenharmony_ci   unsigned lds_per_workgroup = align(program->config->lds_size * program->dev.lds_encoding_granule,
379bf215546Sopenharmony_ci                                      program->dev.lds_alloc_granule);
380bf215546Sopenharmony_ci
381bf215546Sopenharmony_ci   if (program->stage == fragment_fs) {
382bf215546Sopenharmony_ci      /* PS inputs are moved from PC (parameter cache) to LDS before PS waves are launched.
383bf215546Sopenharmony_ci       * Each PS input occupies 3x vec4 of LDS space. See Figure 10.3 in GCN3 ISA manual.
384bf215546Sopenharmony_ci       * These limit occupancy the same way as other stages' LDS usage does.
385bf215546Sopenharmony_ci       */
386bf215546Sopenharmony_ci      unsigned lds_bytes_per_interp = 3 * 16;
387bf215546Sopenharmony_ci      unsigned lds_param_bytes = lds_bytes_per_interp * program->info.ps.num_interp;
388bf215546Sopenharmony_ci      lds_per_workgroup += align(lds_param_bytes, program->dev.lds_alloc_granule);
389bf215546Sopenharmony_ci   }
390bf215546Sopenharmony_ci   unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit;
391bf215546Sopenharmony_ci   if (lds_per_workgroup)
392bf215546Sopenharmony_ci      num_workgroups = std::min(num_workgroups, lds_limit / lds_per_workgroup);
393bf215546Sopenharmony_ci
394bf215546Sopenharmony_ci   /* Hardware limitation */
395bf215546Sopenharmony_ci   if (waves_per_workgroup > 1)
396bf215546Sopenharmony_ci      num_workgroups = std::min(num_workgroups, program->wgp_mode ? 32u : 16u);
397bf215546Sopenharmony_ci
398bf215546Sopenharmony_ci   /* Adjust #waves for workgroup multiples:
399bf215546Sopenharmony_ci    * In cases like waves_per_workgroup=3 or lds=65536 and
400bf215546Sopenharmony_ci    * waves_per_workgroup=1, we want the maximum possible number of waves per
401bf215546Sopenharmony_ci    * SIMD and not the minimum. so DIV_ROUND_UP is used
402bf215546Sopenharmony_ci    */
403bf215546Sopenharmony_ci   unsigned workgroup_waves = num_workgroups * waves_per_workgroup;
404bf215546Sopenharmony_ci   return DIV_ROUND_UP(workgroup_waves, num_simd);
405bf215546Sopenharmony_ci}
406bf215546Sopenharmony_ci
407bf215546Sopenharmony_civoid
408bf215546Sopenharmony_ciupdate_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
409bf215546Sopenharmony_ci{
410bf215546Sopenharmony_ci   assert(program->min_waves >= 1);
411bf215546Sopenharmony_ci   uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
412bf215546Sopenharmony_ci   uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
413bf215546Sopenharmony_ci
414bf215546Sopenharmony_ci   /* this won't compile, register pressure reduction necessary */
415bf215546Sopenharmony_ci   if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) {
416bf215546Sopenharmony_ci      program->num_waves = 0;
417bf215546Sopenharmony_ci      program->max_reg_demand = new_demand;
418bf215546Sopenharmony_ci   } else {
419bf215546Sopenharmony_ci      program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
420bf215546Sopenharmony_ci      uint16_t vgpr_demand =
421bf215546Sopenharmony_ci         get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
422bf215546Sopenharmony_ci      program->num_waves =
423bf215546Sopenharmony_ci         std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
424bf215546Sopenharmony_ci      uint16_t max_waves = program->dev.max_wave64_per_simd * (64 / program->wave_size);
425bf215546Sopenharmony_ci      program->num_waves = std::min(program->num_waves, max_waves);
426bf215546Sopenharmony_ci
427bf215546Sopenharmony_ci      /* Adjust for LDS and workgroup multiples and calculate max_reg_demand */
428bf215546Sopenharmony_ci      program->num_waves = max_suitable_waves(program, program->num_waves);
429bf215546Sopenharmony_ci      program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
430bf215546Sopenharmony_ci      program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
431bf215546Sopenharmony_ci   }
432bf215546Sopenharmony_ci}
433bf215546Sopenharmony_ci
434bf215546Sopenharmony_cilive
435bf215546Sopenharmony_cilive_var_analysis(Program* program)
436bf215546Sopenharmony_ci{
437bf215546Sopenharmony_ci   live result;
438bf215546Sopenharmony_ci   result.live_out.resize(program->blocks.size());
439bf215546Sopenharmony_ci   result.register_demand.resize(program->blocks.size());
440bf215546Sopenharmony_ci   unsigned worklist = program->blocks.size();
441bf215546Sopenharmony_ci   std::vector<PhiInfo> phi_info(program->blocks.size());
442bf215546Sopenharmony_ci   RegisterDemand new_demand;
443bf215546Sopenharmony_ci
444bf215546Sopenharmony_ci   program->needs_vcc = program->gfx_level >= GFX10;
445bf215546Sopenharmony_ci
446bf215546Sopenharmony_ci   /* this implementation assumes that the block idx corresponds to the block's position in
447bf215546Sopenharmony_ci    * program->blocks vector */
448bf215546Sopenharmony_ci   while (worklist) {
449bf215546Sopenharmony_ci      unsigned block_idx = --worklist;
450bf215546Sopenharmony_ci      process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist,
451bf215546Sopenharmony_ci                                   phi_info);
452bf215546Sopenharmony_ci      new_demand.update(program->blocks[block_idx].register_demand);
453bf215546Sopenharmony_ci   }
454bf215546Sopenharmony_ci
455bf215546Sopenharmony_ci   /* Handle branches: we will insert copies created for linear phis just before the branch. */
456bf215546Sopenharmony_ci   for (Block& block : program->blocks) {
457bf215546Sopenharmony_ci      result.register_demand[block.index].back().sgpr += phi_info[block.index].linear_phi_defs;
458bf215546Sopenharmony_ci      result.register_demand[block.index].back().sgpr -= phi_info[block.index].linear_phi_ops;
459bf215546Sopenharmony_ci   }
460bf215546Sopenharmony_ci
461bf215546Sopenharmony_ci   /* calculate the program's register demand and number of waves */
462bf215546Sopenharmony_ci   if (program->progress < CompilationProgress::after_ra)
463bf215546Sopenharmony_ci      update_vgpr_sgpr_demand(program, new_demand);
464bf215546Sopenharmony_ci
465bf215546Sopenharmony_ci   return result;
466bf215546Sopenharmony_ci}
467bf215546Sopenharmony_ci
468bf215546Sopenharmony_ci} // namespace aco
469