1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2018 Valve Corporation
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci *
23bf215546Sopenharmony_ci */
24bf215546Sopenharmony_ci
25bf215546Sopenharmony_ci#include "aco_builder.h"
26bf215546Sopenharmony_ci#include "aco_ir.h"
27bf215546Sopenharmony_ci
28bf215546Sopenharmony_ci#include "common/amdgfxregs.h"
29bf215546Sopenharmony_ci
30bf215546Sopenharmony_ci#include <algorithm>
31bf215546Sopenharmony_ci#include <unordered_set>
32bf215546Sopenharmony_ci#include <vector>
33bf215546Sopenharmony_ci
34bf215546Sopenharmony_ci#define SMEM_WINDOW_SIZE    (350 - ctx.num_waves * 35)
35bf215546Sopenharmony_ci#define VMEM_WINDOW_SIZE    (1024 - ctx.num_waves * 64)
36bf215546Sopenharmony_ci#define POS_EXP_WINDOW_SIZE 512
37bf215546Sopenharmony_ci#define SMEM_MAX_MOVES      (64 - ctx.num_waves * 4)
38bf215546Sopenharmony_ci#define VMEM_MAX_MOVES      (256 - ctx.num_waves * 16)
39bf215546Sopenharmony_ci/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
40bf215546Sopenharmony_ci#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
41bf215546Sopenharmony_ci#define POS_EXP_MAX_MOVES         512
42bf215546Sopenharmony_ci
43bf215546Sopenharmony_cinamespace aco {
44bf215546Sopenharmony_ci
45bf215546Sopenharmony_cienum MoveResult {
46bf215546Sopenharmony_ci   move_success,
47bf215546Sopenharmony_ci   move_fail_ssa,
48bf215546Sopenharmony_ci   move_fail_rar,
49bf215546Sopenharmony_ci   move_fail_pressure,
50bf215546Sopenharmony_ci};
51bf215546Sopenharmony_ci
52bf215546Sopenharmony_ci/**
53bf215546Sopenharmony_ci * Cursor for downwards moves, where a single instruction is moved towards
54bf215546Sopenharmony_ci * or below a group of instruction that hardware can execute as a clause.
55bf215546Sopenharmony_ci */
56bf215546Sopenharmony_cistruct DownwardsCursor {
57bf215546Sopenharmony_ci   int source_idx; /* Current instruction to consider for moving */
58bf215546Sopenharmony_ci
59bf215546Sopenharmony_ci   int insert_idx_clause; /* First clause instruction */
60bf215546Sopenharmony_ci   int insert_idx;        /* First instruction *after* the clause */
61bf215546Sopenharmony_ci
62bf215546Sopenharmony_ci   /* Maximum demand of all clause instructions,
63bf215546Sopenharmony_ci    * i.e. from insert_idx_clause (inclusive) to insert_idx (exclusive) */
64bf215546Sopenharmony_ci   RegisterDemand clause_demand;
65bf215546Sopenharmony_ci   /* Maximum demand of instructions from source_idx to insert_idx_clause (both exclusive) */
66bf215546Sopenharmony_ci   RegisterDemand total_demand;
67bf215546Sopenharmony_ci
68bf215546Sopenharmony_ci   DownwardsCursor(int current_idx, RegisterDemand initial_clause_demand)
69bf215546Sopenharmony_ci       : source_idx(current_idx - 1), insert_idx_clause(current_idx), insert_idx(current_idx + 1),
70bf215546Sopenharmony_ci         clause_demand(initial_clause_demand)
71bf215546Sopenharmony_ci   {}
72bf215546Sopenharmony_ci
73bf215546Sopenharmony_ci   void verify_invariants(const RegisterDemand* register_demand);
74bf215546Sopenharmony_ci};
75bf215546Sopenharmony_ci
76bf215546Sopenharmony_ci/**
77bf215546Sopenharmony_ci * Cursor for upwards moves, where a single instruction is moved below
78bf215546Sopenharmony_ci * another instruction.
79bf215546Sopenharmony_ci */
80bf215546Sopenharmony_cistruct UpwardsCursor {
81bf215546Sopenharmony_ci   int source_idx; /* Current instruction to consider for moving */
82bf215546Sopenharmony_ci   int insert_idx; /* Instruction to move in front of */
83bf215546Sopenharmony_ci
84bf215546Sopenharmony_ci   /* Maximum demand of instructions from insert_idx (inclusive) to source_idx (exclusive) */
85bf215546Sopenharmony_ci   RegisterDemand total_demand;
86bf215546Sopenharmony_ci
87bf215546Sopenharmony_ci   UpwardsCursor(int source_idx_) : source_idx(source_idx_)
88bf215546Sopenharmony_ci   {
89bf215546Sopenharmony_ci      insert_idx = -1; /* to be initialized later */
90bf215546Sopenharmony_ci   }
91bf215546Sopenharmony_ci
92bf215546Sopenharmony_ci   bool has_insert_idx() const { return insert_idx != -1; }
93bf215546Sopenharmony_ci   void verify_invariants(const RegisterDemand* register_demand);
94bf215546Sopenharmony_ci};
95bf215546Sopenharmony_ci
96bf215546Sopenharmony_cistruct MoveState {
97bf215546Sopenharmony_ci   RegisterDemand max_registers;
98bf215546Sopenharmony_ci
99bf215546Sopenharmony_ci   Block* block;
100bf215546Sopenharmony_ci   Instruction* current;
101bf215546Sopenharmony_ci   RegisterDemand* register_demand; /* demand per instruction */
102bf215546Sopenharmony_ci   bool improved_rar;
103bf215546Sopenharmony_ci
104bf215546Sopenharmony_ci   std::vector<bool> depends_on;
105bf215546Sopenharmony_ci   /* Two are needed because, for downwards VMEM scheduling, one needs to
106bf215546Sopenharmony_ci    * exclude the instructions in the clause, since new instructions in the
107bf215546Sopenharmony_ci    * clause are not moved past any other instructions in the clause. */
108bf215546Sopenharmony_ci   std::vector<bool> RAR_dependencies;
109bf215546Sopenharmony_ci   std::vector<bool> RAR_dependencies_clause;
110bf215546Sopenharmony_ci
111bf215546Sopenharmony_ci   /* for moving instructions before the current instruction to after it */
112bf215546Sopenharmony_ci   DownwardsCursor downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
113bf215546Sopenharmony_ci   MoveResult downwards_move(DownwardsCursor&, bool clause);
114bf215546Sopenharmony_ci   void downwards_skip(DownwardsCursor&);
115bf215546Sopenharmony_ci
116bf215546Sopenharmony_ci   /* for moving instructions after the first use of the current instruction upwards */
117bf215546Sopenharmony_ci   UpwardsCursor upwards_init(int source_idx, bool improved_rar);
118bf215546Sopenharmony_ci   bool upwards_check_deps(UpwardsCursor&);
119bf215546Sopenharmony_ci   void upwards_update_insert_idx(UpwardsCursor&);
120bf215546Sopenharmony_ci   MoveResult upwards_move(UpwardsCursor&);
121bf215546Sopenharmony_ci   void upwards_skip(UpwardsCursor&);
122bf215546Sopenharmony_ci};
123bf215546Sopenharmony_ci
124bf215546Sopenharmony_cistruct sched_ctx {
125bf215546Sopenharmony_ci   int16_t num_waves;
126bf215546Sopenharmony_ci   int16_t last_SMEM_stall;
127bf215546Sopenharmony_ci   int last_SMEM_dep_idx;
128bf215546Sopenharmony_ci   MoveState mv;
129bf215546Sopenharmony_ci   bool schedule_pos_exports = true;
130bf215546Sopenharmony_ci   unsigned schedule_pos_export_div = 1;
131bf215546Sopenharmony_ci};
132bf215546Sopenharmony_ci
133bf215546Sopenharmony_ci/* This scheduler is a simple bottom-up pass based on ideas from
134bf215546Sopenharmony_ci * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
135bf215546Sopenharmony_ci * from Xiaohua Shi and Peng Guo.
136bf215546Sopenharmony_ci * The basic approach is to iterate over all instructions. When a memory instruction
137bf215546Sopenharmony_ci * is encountered it tries to move independent instructions from above and below
138bf215546Sopenharmony_ci * between the memory instruction and it's first user.
139bf215546Sopenharmony_ci * The novelty is that this scheduler cares for the current register pressure:
140bf215546Sopenharmony_ci * Instructions will only be moved if the register pressure won't exceed a certain bound.
141bf215546Sopenharmony_ci */
142bf215546Sopenharmony_ci
143bf215546Sopenharmony_citemplate <typename T>
144bf215546Sopenharmony_civoid
145bf215546Sopenharmony_cimove_element(T begin_it, size_t idx, size_t before)
146bf215546Sopenharmony_ci{
147bf215546Sopenharmony_ci   if (idx < before) {
148bf215546Sopenharmony_ci      auto begin = std::next(begin_it, idx);
149bf215546Sopenharmony_ci      auto end = std::next(begin_it, before);
150bf215546Sopenharmony_ci      std::rotate(begin, begin + 1, end);
151bf215546Sopenharmony_ci   } else if (idx > before) {
152bf215546Sopenharmony_ci      auto begin = std::next(begin_it, before);
153bf215546Sopenharmony_ci      auto end = std::next(begin_it, idx + 1);
154bf215546Sopenharmony_ci      std::rotate(begin, end - 1, end);
155bf215546Sopenharmony_ci   }
156bf215546Sopenharmony_ci}
157bf215546Sopenharmony_ci
158bf215546Sopenharmony_civoid
159bf215546Sopenharmony_ciDownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
160bf215546Sopenharmony_ci{
161bf215546Sopenharmony_ci   assert(source_idx < insert_idx_clause);
162bf215546Sopenharmony_ci   assert(insert_idx_clause < insert_idx);
163bf215546Sopenharmony_ci
164bf215546Sopenharmony_ci#ifndef NDEBUG
165bf215546Sopenharmony_ci   RegisterDemand reference_demand;
166bf215546Sopenharmony_ci   for (int i = source_idx + 1; i < insert_idx_clause; ++i) {
167bf215546Sopenharmony_ci      reference_demand.update(register_demand[i]);
168bf215546Sopenharmony_ci   }
169bf215546Sopenharmony_ci   assert(total_demand == reference_demand);
170bf215546Sopenharmony_ci
171bf215546Sopenharmony_ci   reference_demand = {};
172bf215546Sopenharmony_ci   for (int i = insert_idx_clause; i < insert_idx; ++i) {
173bf215546Sopenharmony_ci      reference_demand.update(register_demand[i]);
174bf215546Sopenharmony_ci   }
175bf215546Sopenharmony_ci   assert(clause_demand == reference_demand);
176bf215546Sopenharmony_ci#endif
177bf215546Sopenharmony_ci}
178bf215546Sopenharmony_ci
179bf215546Sopenharmony_ciDownwardsCursor
180bf215546Sopenharmony_ciMoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
181bf215546Sopenharmony_ci{
182bf215546Sopenharmony_ci   improved_rar = improved_rar_;
183bf215546Sopenharmony_ci
184bf215546Sopenharmony_ci   std::fill(depends_on.begin(), depends_on.end(), false);
185bf215546Sopenharmony_ci   if (improved_rar) {
186bf215546Sopenharmony_ci      std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
187bf215546Sopenharmony_ci      if (may_form_clauses)
188bf215546Sopenharmony_ci         std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
189bf215546Sopenharmony_ci   }
190bf215546Sopenharmony_ci
191bf215546Sopenharmony_ci   for (const Operand& op : current->operands) {
192bf215546Sopenharmony_ci      if (op.isTemp()) {
193bf215546Sopenharmony_ci         depends_on[op.tempId()] = true;
194bf215546Sopenharmony_ci         if (improved_rar && op.isFirstKill())
195bf215546Sopenharmony_ci            RAR_dependencies[op.tempId()] = true;
196bf215546Sopenharmony_ci      }
197bf215546Sopenharmony_ci   }
198bf215546Sopenharmony_ci
199bf215546Sopenharmony_ci   DownwardsCursor cursor(current_idx, register_demand[current_idx]);
200bf215546Sopenharmony_ci   cursor.verify_invariants(register_demand);
201bf215546Sopenharmony_ci   return cursor;
202bf215546Sopenharmony_ci}
203bf215546Sopenharmony_ci
204bf215546Sopenharmony_ci/* If add_to_clause is true, the current clause is extended by moving the
205bf215546Sopenharmony_ci * instruction at source_idx in front of the clause. Otherwise, the instruction
206bf215546Sopenharmony_ci * is moved past the end of the clause without extending it */
207bf215546Sopenharmony_ciMoveResult
208bf215546Sopenharmony_ciMoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause)
209bf215546Sopenharmony_ci{
210bf215546Sopenharmony_ci   aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
211bf215546Sopenharmony_ci
212bf215546Sopenharmony_ci   for (const Definition& def : instr->definitions)
213bf215546Sopenharmony_ci      if (def.isTemp() && depends_on[def.tempId()])
214bf215546Sopenharmony_ci         return move_fail_ssa;
215bf215546Sopenharmony_ci
216bf215546Sopenharmony_ci   /* check if one of candidate's operands is killed by depending instruction */
217bf215546Sopenharmony_ci   std::vector<bool>& RAR_deps =
218bf215546Sopenharmony_ci      improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
219bf215546Sopenharmony_ci   for (const Operand& op : instr->operands) {
220bf215546Sopenharmony_ci      if (op.isTemp() && RAR_deps[op.tempId()]) {
221bf215546Sopenharmony_ci         // FIXME: account for difference in register pressure
222bf215546Sopenharmony_ci         return move_fail_rar;
223bf215546Sopenharmony_ci      }
224bf215546Sopenharmony_ci   }
225bf215546Sopenharmony_ci
226bf215546Sopenharmony_ci   if (add_to_clause) {
227bf215546Sopenharmony_ci      for (const Operand& op : instr->operands) {
228bf215546Sopenharmony_ci         if (op.isTemp()) {
229bf215546Sopenharmony_ci            depends_on[op.tempId()] = true;
230bf215546Sopenharmony_ci            if (op.isFirstKill())
231bf215546Sopenharmony_ci               RAR_dependencies[op.tempId()] = true;
232bf215546Sopenharmony_ci         }
233bf215546Sopenharmony_ci      }
234bf215546Sopenharmony_ci   }
235bf215546Sopenharmony_ci
236bf215546Sopenharmony_ci   const int dest_insert_idx = add_to_clause ? cursor.insert_idx_clause : cursor.insert_idx;
237bf215546Sopenharmony_ci   RegisterDemand register_pressure = cursor.total_demand;
238bf215546Sopenharmony_ci   if (!add_to_clause) {
239bf215546Sopenharmony_ci      register_pressure.update(cursor.clause_demand);
240bf215546Sopenharmony_ci   }
241bf215546Sopenharmony_ci
242bf215546Sopenharmony_ci   /* Check the new demand of the instructions being moved over */
243bf215546Sopenharmony_ci   const RegisterDemand candidate_diff = get_live_changes(instr);
244bf215546Sopenharmony_ci   if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
245bf215546Sopenharmony_ci      return move_fail_pressure;
246bf215546Sopenharmony_ci
247bf215546Sopenharmony_ci   /* New demand for the moved instruction */
248bf215546Sopenharmony_ci   const RegisterDemand temp = get_temp_registers(instr);
249bf215546Sopenharmony_ci   const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
250bf215546Sopenharmony_ci   const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
251bf215546Sopenharmony_ci   if (new_demand.exceeds(max_registers))
252bf215546Sopenharmony_ci      return move_fail_pressure;
253bf215546Sopenharmony_ci
254bf215546Sopenharmony_ci   /* move the candidate below the memory load */
255bf215546Sopenharmony_ci   move_element(block->instructions.begin(), cursor.source_idx, dest_insert_idx);
256bf215546Sopenharmony_ci
257bf215546Sopenharmony_ci   /* update register pressure */
258bf215546Sopenharmony_ci   move_element(register_demand, cursor.source_idx, dest_insert_idx);
259bf215546Sopenharmony_ci   for (int i = cursor.source_idx; i < dest_insert_idx - 1; i++)
260bf215546Sopenharmony_ci      register_demand[i] -= candidate_diff;
261bf215546Sopenharmony_ci   register_demand[dest_insert_idx - 1] = new_demand;
262bf215546Sopenharmony_ci   cursor.insert_idx_clause--;
263bf215546Sopenharmony_ci   if (cursor.source_idx != cursor.insert_idx_clause) {
264bf215546Sopenharmony_ci      /* Update demand if we moved over any instructions before the clause */
265bf215546Sopenharmony_ci      cursor.total_demand -= candidate_diff;
266bf215546Sopenharmony_ci   } else {
267bf215546Sopenharmony_ci      assert(cursor.total_demand == RegisterDemand{});
268bf215546Sopenharmony_ci   }
269bf215546Sopenharmony_ci   if (add_to_clause) {
270bf215546Sopenharmony_ci      cursor.clause_demand.update(new_demand);
271bf215546Sopenharmony_ci   } else {
272bf215546Sopenharmony_ci      cursor.clause_demand -= candidate_diff;
273bf215546Sopenharmony_ci      cursor.insert_idx--;
274bf215546Sopenharmony_ci   }
275bf215546Sopenharmony_ci
276bf215546Sopenharmony_ci   cursor.source_idx--;
277bf215546Sopenharmony_ci   cursor.verify_invariants(register_demand);
278bf215546Sopenharmony_ci   return move_success;
279bf215546Sopenharmony_ci}
280bf215546Sopenharmony_ci
281bf215546Sopenharmony_civoid
282bf215546Sopenharmony_ciMoveState::downwards_skip(DownwardsCursor& cursor)
283bf215546Sopenharmony_ci{
284bf215546Sopenharmony_ci   aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
285bf215546Sopenharmony_ci
286bf215546Sopenharmony_ci   for (const Operand& op : instr->operands) {
287bf215546Sopenharmony_ci      if (op.isTemp()) {
288bf215546Sopenharmony_ci         depends_on[op.tempId()] = true;
289bf215546Sopenharmony_ci         if (improved_rar && op.isFirstKill()) {
290bf215546Sopenharmony_ci            RAR_dependencies[op.tempId()] = true;
291bf215546Sopenharmony_ci            RAR_dependencies_clause[op.tempId()] = true;
292bf215546Sopenharmony_ci         }
293bf215546Sopenharmony_ci      }
294bf215546Sopenharmony_ci   }
295bf215546Sopenharmony_ci   cursor.total_demand.update(register_demand[cursor.source_idx]);
296bf215546Sopenharmony_ci   cursor.source_idx--;
297bf215546Sopenharmony_ci   cursor.verify_invariants(register_demand);
298bf215546Sopenharmony_ci}
299bf215546Sopenharmony_ci
300bf215546Sopenharmony_civoid
301bf215546Sopenharmony_ciUpwardsCursor::verify_invariants(const RegisterDemand* register_demand)
302bf215546Sopenharmony_ci{
303bf215546Sopenharmony_ci#ifndef NDEBUG
304bf215546Sopenharmony_ci   if (!has_insert_idx()) {
305bf215546Sopenharmony_ci      return;
306bf215546Sopenharmony_ci   }
307bf215546Sopenharmony_ci
308bf215546Sopenharmony_ci   assert(insert_idx < source_idx);
309bf215546Sopenharmony_ci
310bf215546Sopenharmony_ci   RegisterDemand reference_demand;
311bf215546Sopenharmony_ci   for (int i = insert_idx; i < source_idx; ++i) {
312bf215546Sopenharmony_ci      reference_demand.update(register_demand[i]);
313bf215546Sopenharmony_ci   }
314bf215546Sopenharmony_ci   assert(total_demand == reference_demand);
315bf215546Sopenharmony_ci#endif
316bf215546Sopenharmony_ci}
317bf215546Sopenharmony_ci
318bf215546Sopenharmony_ciUpwardsCursor
319bf215546Sopenharmony_ciMoveState::upwards_init(int source_idx, bool improved_rar_)
320bf215546Sopenharmony_ci{
321bf215546Sopenharmony_ci   improved_rar = improved_rar_;
322bf215546Sopenharmony_ci
323bf215546Sopenharmony_ci   std::fill(depends_on.begin(), depends_on.end(), false);
324bf215546Sopenharmony_ci   std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
325bf215546Sopenharmony_ci
326bf215546Sopenharmony_ci   for (const Definition& def : current->definitions) {
327bf215546Sopenharmony_ci      if (def.isTemp())
328bf215546Sopenharmony_ci         depends_on[def.tempId()] = true;
329bf215546Sopenharmony_ci   }
330bf215546Sopenharmony_ci
331bf215546Sopenharmony_ci   return UpwardsCursor(source_idx);
332bf215546Sopenharmony_ci}
333bf215546Sopenharmony_ci
334bf215546Sopenharmony_cibool
335bf215546Sopenharmony_ciMoveState::upwards_check_deps(UpwardsCursor& cursor)
336bf215546Sopenharmony_ci{
337bf215546Sopenharmony_ci   aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
338bf215546Sopenharmony_ci   for (const Operand& op : instr->operands) {
339bf215546Sopenharmony_ci      if (op.isTemp() && depends_on[op.tempId()])
340bf215546Sopenharmony_ci         return false;
341bf215546Sopenharmony_ci   }
342bf215546Sopenharmony_ci   return true;
343bf215546Sopenharmony_ci}
344bf215546Sopenharmony_ci
345bf215546Sopenharmony_civoid
346bf215546Sopenharmony_ciMoveState::upwards_update_insert_idx(UpwardsCursor& cursor)
347bf215546Sopenharmony_ci{
348bf215546Sopenharmony_ci   cursor.insert_idx = cursor.source_idx;
349bf215546Sopenharmony_ci   cursor.total_demand = register_demand[cursor.insert_idx];
350bf215546Sopenharmony_ci}
351bf215546Sopenharmony_ci
352bf215546Sopenharmony_ciMoveResult
353bf215546Sopenharmony_ciMoveState::upwards_move(UpwardsCursor& cursor)
354bf215546Sopenharmony_ci{
355bf215546Sopenharmony_ci   assert(cursor.has_insert_idx());
356bf215546Sopenharmony_ci
357bf215546Sopenharmony_ci   aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
358bf215546Sopenharmony_ci   for (const Operand& op : instr->operands) {
359bf215546Sopenharmony_ci      if (op.isTemp() && depends_on[op.tempId()])
360bf215546Sopenharmony_ci         return move_fail_ssa;
361bf215546Sopenharmony_ci   }
362bf215546Sopenharmony_ci
363bf215546Sopenharmony_ci   /* check if candidate uses/kills an operand which is used by a dependency */
364bf215546Sopenharmony_ci   for (const Operand& op : instr->operands) {
365bf215546Sopenharmony_ci      if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
366bf215546Sopenharmony_ci         return move_fail_rar;
367bf215546Sopenharmony_ci   }
368bf215546Sopenharmony_ci
369bf215546Sopenharmony_ci   /* check if register pressure is low enough: the diff is negative if register pressure is
370bf215546Sopenharmony_ci    * decreased */
371bf215546Sopenharmony_ci   const RegisterDemand candidate_diff = get_live_changes(instr);
372bf215546Sopenharmony_ci   const RegisterDemand temp = get_temp_registers(instr);
373bf215546Sopenharmony_ci   if (RegisterDemand(cursor.total_demand + candidate_diff).exceeds(max_registers))
374bf215546Sopenharmony_ci      return move_fail_pressure;
375bf215546Sopenharmony_ci   const RegisterDemand temp2 = get_temp_registers(block->instructions[cursor.insert_idx - 1]);
376bf215546Sopenharmony_ci   const RegisterDemand new_demand =
377bf215546Sopenharmony_ci      register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp;
378bf215546Sopenharmony_ci   if (new_demand.exceeds(max_registers))
379bf215546Sopenharmony_ci      return move_fail_pressure;
380bf215546Sopenharmony_ci
381bf215546Sopenharmony_ci   /* move the candidate above the insert_idx */
382bf215546Sopenharmony_ci   move_element(block->instructions.begin(), cursor.source_idx, cursor.insert_idx);
383bf215546Sopenharmony_ci
384bf215546Sopenharmony_ci   /* update register pressure */
385bf215546Sopenharmony_ci   move_element(register_demand, cursor.source_idx, cursor.insert_idx);
386bf215546Sopenharmony_ci   register_demand[cursor.insert_idx] = new_demand;
387bf215546Sopenharmony_ci   for (int i = cursor.insert_idx + 1; i <= cursor.source_idx; i++)
388bf215546Sopenharmony_ci      register_demand[i] += candidate_diff;
389bf215546Sopenharmony_ci   cursor.total_demand += candidate_diff;
390bf215546Sopenharmony_ci
391bf215546Sopenharmony_ci   cursor.total_demand.update(register_demand[cursor.source_idx]);
392bf215546Sopenharmony_ci
393bf215546Sopenharmony_ci   cursor.insert_idx++;
394bf215546Sopenharmony_ci   cursor.source_idx++;
395bf215546Sopenharmony_ci
396bf215546Sopenharmony_ci   cursor.verify_invariants(register_demand);
397bf215546Sopenharmony_ci
398bf215546Sopenharmony_ci   return move_success;
399bf215546Sopenharmony_ci}
400bf215546Sopenharmony_ci
401bf215546Sopenharmony_civoid
402bf215546Sopenharmony_ciMoveState::upwards_skip(UpwardsCursor& cursor)
403bf215546Sopenharmony_ci{
404bf215546Sopenharmony_ci   if (cursor.has_insert_idx()) {
405bf215546Sopenharmony_ci      aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
406bf215546Sopenharmony_ci      for (const Definition& def : instr->definitions) {
407bf215546Sopenharmony_ci         if (def.isTemp())
408bf215546Sopenharmony_ci            depends_on[def.tempId()] = true;
409bf215546Sopenharmony_ci      }
410bf215546Sopenharmony_ci      for (const Operand& op : instr->operands) {
411bf215546Sopenharmony_ci         if (op.isTemp())
412bf215546Sopenharmony_ci            RAR_dependencies[op.tempId()] = true;
413bf215546Sopenharmony_ci      }
414bf215546Sopenharmony_ci      cursor.total_demand.update(register_demand[cursor.source_idx]);
415bf215546Sopenharmony_ci   }
416bf215546Sopenharmony_ci
417bf215546Sopenharmony_ci   cursor.source_idx++;
418bf215546Sopenharmony_ci
419bf215546Sopenharmony_ci   cursor.verify_invariants(register_demand);
420bf215546Sopenharmony_ci}
421bf215546Sopenharmony_ci
422bf215546Sopenharmony_cibool
423bf215546Sopenharmony_ciis_gs_or_done_sendmsg(const Instruction* instr)
424bf215546Sopenharmony_ci{
425bf215546Sopenharmony_ci   if (instr->opcode == aco_opcode::s_sendmsg) {
426bf215546Sopenharmony_ci      uint16_t imm = instr->sopp().imm;
427bf215546Sopenharmony_ci      return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done;
428bf215546Sopenharmony_ci   }
429bf215546Sopenharmony_ci   return false;
430bf215546Sopenharmony_ci}
431bf215546Sopenharmony_ci
432bf215546Sopenharmony_cibool
433bf215546Sopenharmony_ciis_done_sendmsg(const Instruction* instr)
434bf215546Sopenharmony_ci{
435bf215546Sopenharmony_ci   if (instr->opcode == aco_opcode::s_sendmsg)
436bf215546Sopenharmony_ci      return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done;
437bf215546Sopenharmony_ci   return false;
438bf215546Sopenharmony_ci}
439bf215546Sopenharmony_ci
440bf215546Sopenharmony_cimemory_sync_info
441bf215546Sopenharmony_ciget_sync_info_with_hack(const Instruction* instr)
442bf215546Sopenharmony_ci{
443bf215546Sopenharmony_ci   memory_sync_info sync = get_sync_info(instr);
444bf215546Sopenharmony_ci   if (instr->isSMEM() && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
445bf215546Sopenharmony_ci      // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
446bf215546Sopenharmony_ci      sync.storage = (storage_class)(sync.storage | storage_buffer);
447bf215546Sopenharmony_ci      sync.semantics =
448bf215546Sopenharmony_ci         (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder);
449bf215546Sopenharmony_ci   }
450bf215546Sopenharmony_ci   return sync;
451bf215546Sopenharmony_ci}
452bf215546Sopenharmony_ci
453bf215546Sopenharmony_cistruct memory_event_set {
454bf215546Sopenharmony_ci   bool has_control_barrier;
455bf215546Sopenharmony_ci
456bf215546Sopenharmony_ci   unsigned bar_acquire;
457bf215546Sopenharmony_ci   unsigned bar_release;
458bf215546Sopenharmony_ci   unsigned bar_classes;
459bf215546Sopenharmony_ci
460bf215546Sopenharmony_ci   unsigned access_acquire;
461bf215546Sopenharmony_ci   unsigned access_release;
462bf215546Sopenharmony_ci   unsigned access_relaxed;
463bf215546Sopenharmony_ci   unsigned access_atomic;
464bf215546Sopenharmony_ci};
465bf215546Sopenharmony_ci
466bf215546Sopenharmony_cistruct hazard_query {
467bf215546Sopenharmony_ci   bool contains_spill;
468bf215546Sopenharmony_ci   bool contains_sendmsg;
469bf215546Sopenharmony_ci   bool uses_exec;
470bf215546Sopenharmony_ci   memory_event_set mem_events;
471bf215546Sopenharmony_ci   unsigned aliasing_storage;      /* storage classes which are accessed (non-SMEM) */
472bf215546Sopenharmony_ci   unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
473bf215546Sopenharmony_ci};
474bf215546Sopenharmony_ci
475bf215546Sopenharmony_civoid
476bf215546Sopenharmony_ciinit_hazard_query(hazard_query* query)
477bf215546Sopenharmony_ci{
478bf215546Sopenharmony_ci   query->contains_spill = false;
479bf215546Sopenharmony_ci   query->contains_sendmsg = false;
480bf215546Sopenharmony_ci   query->uses_exec = false;
481bf215546Sopenharmony_ci   memset(&query->mem_events, 0, sizeof(query->mem_events));
482bf215546Sopenharmony_ci   query->aliasing_storage = 0;
483bf215546Sopenharmony_ci   query->aliasing_storage_smem = 0;
484bf215546Sopenharmony_ci}
485bf215546Sopenharmony_ci
486bf215546Sopenharmony_civoid
487bf215546Sopenharmony_ciadd_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync)
488bf215546Sopenharmony_ci{
489bf215546Sopenharmony_ci   set->has_control_barrier |= is_done_sendmsg(instr);
490bf215546Sopenharmony_ci   if (instr->opcode == aco_opcode::p_barrier) {
491bf215546Sopenharmony_ci      Pseudo_barrier_instruction& bar = instr->barrier();
492bf215546Sopenharmony_ci      if (bar.sync.semantics & semantic_acquire)
493bf215546Sopenharmony_ci         set->bar_acquire |= bar.sync.storage;
494bf215546Sopenharmony_ci      if (bar.sync.semantics & semantic_release)
495bf215546Sopenharmony_ci         set->bar_release |= bar.sync.storage;
496bf215546Sopenharmony_ci      set->bar_classes |= bar.sync.storage;
497bf215546Sopenharmony_ci
498bf215546Sopenharmony_ci      set->has_control_barrier |= bar.exec_scope > scope_invocation;
499bf215546Sopenharmony_ci   }
500bf215546Sopenharmony_ci
501bf215546Sopenharmony_ci   if (!sync->storage)
502bf215546Sopenharmony_ci      return;
503bf215546Sopenharmony_ci
504bf215546Sopenharmony_ci   if (sync->semantics & semantic_acquire)
505bf215546Sopenharmony_ci      set->access_acquire |= sync->storage;
506bf215546Sopenharmony_ci   if (sync->semantics & semantic_release)
507bf215546Sopenharmony_ci      set->access_release |= sync->storage;
508bf215546Sopenharmony_ci
509bf215546Sopenharmony_ci   if (!(sync->semantics & semantic_private)) {
510bf215546Sopenharmony_ci      if (sync->semantics & semantic_atomic)
511bf215546Sopenharmony_ci         set->access_atomic |= sync->storage;
512bf215546Sopenharmony_ci      else
513bf215546Sopenharmony_ci         set->access_relaxed |= sync->storage;
514bf215546Sopenharmony_ci   }
515bf215546Sopenharmony_ci}
516bf215546Sopenharmony_ci
517bf215546Sopenharmony_civoid
518bf215546Sopenharmony_ciadd_to_hazard_query(hazard_query* query, Instruction* instr)
519bf215546Sopenharmony_ci{
520bf215546Sopenharmony_ci   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
521bf215546Sopenharmony_ci      query->contains_spill = true;
522bf215546Sopenharmony_ci   query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;
523bf215546Sopenharmony_ci   query->uses_exec |= needs_exec_mask(instr);
524bf215546Sopenharmony_ci
525bf215546Sopenharmony_ci   memory_sync_info sync = get_sync_info_with_hack(instr);
526bf215546Sopenharmony_ci
527bf215546Sopenharmony_ci   add_memory_event(&query->mem_events, instr, &sync);
528bf215546Sopenharmony_ci
529bf215546Sopenharmony_ci   if (!(sync.semantics & semantic_can_reorder)) {
530bf215546Sopenharmony_ci      unsigned storage = sync.storage;
531bf215546Sopenharmony_ci      /* images and buffer/global memory can alias */ // TODO: more precisely, buffer images and
532bf215546Sopenharmony_ci                                                      // buffer/global memory can alias
533bf215546Sopenharmony_ci      if (storage & (storage_buffer | storage_image))
534bf215546Sopenharmony_ci         storage |= storage_buffer | storage_image;
535bf215546Sopenharmony_ci      if (instr->isSMEM())
536bf215546Sopenharmony_ci         query->aliasing_storage_smem |= storage;
537bf215546Sopenharmony_ci      else
538bf215546Sopenharmony_ci         query->aliasing_storage |= storage;
539bf215546Sopenharmony_ci   }
540bf215546Sopenharmony_ci}
541bf215546Sopenharmony_ci
542bf215546Sopenharmony_cienum HazardResult {
543bf215546Sopenharmony_ci   hazard_success,
544bf215546Sopenharmony_ci   hazard_fail_reorder_vmem_smem,
545bf215546Sopenharmony_ci   hazard_fail_reorder_ds,
546bf215546Sopenharmony_ci   hazard_fail_reorder_sendmsg,
547bf215546Sopenharmony_ci   hazard_fail_spill,
548bf215546Sopenharmony_ci   hazard_fail_export,
549bf215546Sopenharmony_ci   hazard_fail_barrier,
550bf215546Sopenharmony_ci   /* Must stop at these failures. The hazard query code doesn't consider them
551bf215546Sopenharmony_ci    * when added. */
552bf215546Sopenharmony_ci   hazard_fail_exec,
553bf215546Sopenharmony_ci   hazard_fail_unreorderable,
554bf215546Sopenharmony_ci};
555bf215546Sopenharmony_ci
556bf215546Sopenharmony_ciHazardResult
557bf215546Sopenharmony_ciperform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
558bf215546Sopenharmony_ci{
559bf215546Sopenharmony_ci   /* don't schedule discards downwards */
560bf215546Sopenharmony_ci   if (!upwards && instr->opcode == aco_opcode::p_exit_early_if)
561bf215546Sopenharmony_ci      return hazard_fail_unreorderable;
562bf215546Sopenharmony_ci
563bf215546Sopenharmony_ci   if (query->uses_exec) {
564bf215546Sopenharmony_ci      for (const Definition& def : instr->definitions) {
565bf215546Sopenharmony_ci         if (def.isFixed() && def.physReg() == exec)
566bf215546Sopenharmony_ci            return hazard_fail_exec;
567bf215546Sopenharmony_ci      }
568bf215546Sopenharmony_ci   }
569bf215546Sopenharmony_ci
570bf215546Sopenharmony_ci   /* don't move exports so that they stay closer together */
571bf215546Sopenharmony_ci   if (instr->isEXP())
572bf215546Sopenharmony_ci      return hazard_fail_export;
573bf215546Sopenharmony_ci
574bf215546Sopenharmony_ci   /* don't move non-reorderable instructions */
575bf215546Sopenharmony_ci   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
576bf215546Sopenharmony_ci       instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 ||
577bf215546Sopenharmony_ci       instr->opcode == aco_opcode::p_init_scratch || instr->opcode == aco_opcode::p_jump_to_epilog)
578bf215546Sopenharmony_ci      return hazard_fail_unreorderable;
579bf215546Sopenharmony_ci
580bf215546Sopenharmony_ci   memory_event_set instr_set;
581bf215546Sopenharmony_ci   memset(&instr_set, 0, sizeof(instr_set));
582bf215546Sopenharmony_ci   memory_sync_info sync = get_sync_info_with_hack(instr);
583bf215546Sopenharmony_ci   add_memory_event(&instr_set, instr, &sync);
584bf215546Sopenharmony_ci
585bf215546Sopenharmony_ci   memory_event_set* first = &instr_set;
586bf215546Sopenharmony_ci   memory_event_set* second = &query->mem_events;
587bf215546Sopenharmony_ci   if (upwards)
588bf215546Sopenharmony_ci      std::swap(first, second);
589bf215546Sopenharmony_ci
590bf215546Sopenharmony_ci   /* everything after barrier(acquire) happens after the atomics/control_barriers before
591bf215546Sopenharmony_ci    * everything after load(acquire) happens after the load
592bf215546Sopenharmony_ci    */
593bf215546Sopenharmony_ci   if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
594bf215546Sopenharmony_ci      return hazard_fail_barrier;
595bf215546Sopenharmony_ci   if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
596bf215546Sopenharmony_ci       ((first->access_acquire | first->bar_acquire) &
597bf215546Sopenharmony_ci        (second->access_relaxed | second->access_atomic)))
598bf215546Sopenharmony_ci      return hazard_fail_barrier;
599bf215546Sopenharmony_ci
600bf215546Sopenharmony_ci   /* everything before barrier(release) happens before the atomics/control_barriers after *
601bf215546Sopenharmony_ci    * everything before store(release) happens before the store
602bf215546Sopenharmony_ci    */
603bf215546Sopenharmony_ci   if (first->bar_release && (second->has_control_barrier || second->access_atomic))
604bf215546Sopenharmony_ci      return hazard_fail_barrier;
605bf215546Sopenharmony_ci   if ((first->bar_classes && (second->bar_release || second->access_release)) ||
606bf215546Sopenharmony_ci       ((first->access_relaxed | first->access_atomic) &
607bf215546Sopenharmony_ci        (second->bar_release | second->access_release)))
608bf215546Sopenharmony_ci      return hazard_fail_barrier;
609bf215546Sopenharmony_ci
610bf215546Sopenharmony_ci   /* don't move memory barriers around other memory barriers */
611bf215546Sopenharmony_ci   if (first->bar_classes && second->bar_classes)
612bf215546Sopenharmony_ci      return hazard_fail_barrier;
613bf215546Sopenharmony_ci
614bf215546Sopenharmony_ci   /* Don't move memory accesses to before control barriers. I don't think
615bf215546Sopenharmony_ci    * this is necessary for the Vulkan memory model, but it might be for GLSL450. */
616bf215546Sopenharmony_ci   unsigned control_classes =
617bf215546Sopenharmony_ci      storage_buffer | storage_atomic_counter | storage_image | storage_shared |
618bf215546Sopenharmony_ci      storage_task_payload;
619bf215546Sopenharmony_ci   if (first->has_control_barrier &&
620bf215546Sopenharmony_ci       ((second->access_atomic | second->access_relaxed) & control_classes))
621bf215546Sopenharmony_ci      return hazard_fail_barrier;
622bf215546Sopenharmony_ci
623bf215546Sopenharmony_ci   /* don't move memory loads/stores past potentially aliasing loads/stores */
624bf215546Sopenharmony_ci   unsigned aliasing_storage =
625bf215546Sopenharmony_ci      instr->isSMEM() ? query->aliasing_storage_smem : query->aliasing_storage;
626bf215546Sopenharmony_ci   if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
627bf215546Sopenharmony_ci      unsigned intersect = sync.storage & aliasing_storage;
628bf215546Sopenharmony_ci      if (intersect & storage_shared)
629bf215546Sopenharmony_ci         return hazard_fail_reorder_ds;
630bf215546Sopenharmony_ci      return hazard_fail_reorder_vmem_smem;
631bf215546Sopenharmony_ci   }
632bf215546Sopenharmony_ci
633bf215546Sopenharmony_ci   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
634bf215546Sopenharmony_ci       query->contains_spill)
635bf215546Sopenharmony_ci      return hazard_fail_spill;
636bf215546Sopenharmony_ci
637bf215546Sopenharmony_ci   if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
638bf215546Sopenharmony_ci      return hazard_fail_reorder_sendmsg;
639bf215546Sopenharmony_ci
640bf215546Sopenharmony_ci   return hazard_success;
641bf215546Sopenharmony_ci}
642bf215546Sopenharmony_ci
643bf215546Sopenharmony_civoid
644bf215546Sopenharmony_cischedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
645bf215546Sopenharmony_ci              Instruction* current, int idx)
646bf215546Sopenharmony_ci{
647bf215546Sopenharmony_ci   assert(idx != 0);
648bf215546Sopenharmony_ci   int window_size = SMEM_WINDOW_SIZE;
649bf215546Sopenharmony_ci   int max_moves = SMEM_MAX_MOVES;
650bf215546Sopenharmony_ci   int16_t k = 0;
651bf215546Sopenharmony_ci
652bf215546Sopenharmony_ci   /* don't move s_memtime/s_memrealtime */
653bf215546Sopenharmony_ci   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
654bf215546Sopenharmony_ci      return;
655bf215546Sopenharmony_ci
656bf215546Sopenharmony_ci   /* first, check if we have instructions before current to move down */
657bf215546Sopenharmony_ci   hazard_query hq;
658bf215546Sopenharmony_ci   init_hazard_query(&hq);
659bf215546Sopenharmony_ci   add_to_hazard_query(&hq, current);
660bf215546Sopenharmony_ci
661bf215546Sopenharmony_ci   DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false);
662bf215546Sopenharmony_ci
663bf215546Sopenharmony_ci   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
664bf215546Sopenharmony_ci        candidate_idx--) {
665bf215546Sopenharmony_ci      assert(candidate_idx >= 0);
666bf215546Sopenharmony_ci      assert(candidate_idx == cursor.source_idx);
667bf215546Sopenharmony_ci      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
668bf215546Sopenharmony_ci
669bf215546Sopenharmony_ci      /* break if we'd make the previous SMEM instruction stall */
670bf215546Sopenharmony_ci      bool can_stall_prev_smem =
671bf215546Sopenharmony_ci         idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
672bf215546Sopenharmony_ci      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
673bf215546Sopenharmony_ci         break;
674bf215546Sopenharmony_ci
675bf215546Sopenharmony_ci      /* break when encountering another MEM instruction, logical_start or barriers */
676bf215546Sopenharmony_ci      if (candidate->opcode == aco_opcode::p_logical_start)
677bf215546Sopenharmony_ci         break;
678bf215546Sopenharmony_ci      /* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
679bf215546Sopenharmony_ci       * to help create more vmem clauses */
680bf215546Sopenharmony_ci      if ((candidate->isVMEM() || candidate->isFlatLike()) &&
681bf215546Sopenharmony_ci          (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
682bf215546Sopenharmony_ci           current->operands[0].size() == 4))
683bf215546Sopenharmony_ci         break;
684bf215546Sopenharmony_ci      /* don't move descriptor loads below buffer loads */
685bf215546Sopenharmony_ci      if (candidate->format == Format::SMEM && current->operands[0].size() == 4 &&
686bf215546Sopenharmony_ci          candidate->operands[0].size() == 2)
687bf215546Sopenharmony_ci         break;
688bf215546Sopenharmony_ci
689bf215546Sopenharmony_ci      bool can_move_down = true;
690bf215546Sopenharmony_ci
691bf215546Sopenharmony_ci      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
692bf215546Sopenharmony_ci      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
693bf215546Sopenharmony_ci          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
694bf215546Sopenharmony_ci          haz == hazard_fail_export)
695bf215546Sopenharmony_ci         can_move_down = false;
696bf215546Sopenharmony_ci      else if (haz != hazard_success)
697bf215546Sopenharmony_ci         break;
698bf215546Sopenharmony_ci
699bf215546Sopenharmony_ci      /* don't use LDS/GDS instructions to hide latency since it can
700bf215546Sopenharmony_ci       * significanly worsen LDS scheduling */
701bf215546Sopenharmony_ci      if (candidate->isDS() || !can_move_down) {
702bf215546Sopenharmony_ci         add_to_hazard_query(&hq, candidate.get());
703bf215546Sopenharmony_ci         ctx.mv.downwards_skip(cursor);
704bf215546Sopenharmony_ci         continue;
705bf215546Sopenharmony_ci      }
706bf215546Sopenharmony_ci
707bf215546Sopenharmony_ci      MoveResult res = ctx.mv.downwards_move(cursor, false);
708bf215546Sopenharmony_ci      if (res == move_fail_ssa || res == move_fail_rar) {
709bf215546Sopenharmony_ci         add_to_hazard_query(&hq, candidate.get());
710bf215546Sopenharmony_ci         ctx.mv.downwards_skip(cursor);
711bf215546Sopenharmony_ci         continue;
712bf215546Sopenharmony_ci      } else if (res == move_fail_pressure) {
713bf215546Sopenharmony_ci         break;
714bf215546Sopenharmony_ci      }
715bf215546Sopenharmony_ci
716bf215546Sopenharmony_ci      if (candidate_idx < ctx.last_SMEM_dep_idx)
717bf215546Sopenharmony_ci         ctx.last_SMEM_stall++;
718bf215546Sopenharmony_ci      k++;
719bf215546Sopenharmony_ci   }
720bf215546Sopenharmony_ci
721bf215546Sopenharmony_ci   /* find the first instruction depending on current or find another MEM */
722bf215546Sopenharmony_ci   UpwardsCursor up_cursor = ctx.mv.upwards_init(idx + 1, false);
723bf215546Sopenharmony_ci
724bf215546Sopenharmony_ci   bool found_dependency = false;
725bf215546Sopenharmony_ci   /* second, check if we have instructions after current to move up */
726bf215546Sopenharmony_ci   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
727bf215546Sopenharmony_ci        candidate_idx++) {
728bf215546Sopenharmony_ci      assert(candidate_idx == up_cursor.source_idx);
729bf215546Sopenharmony_ci      assert(candidate_idx < (int)block->instructions.size());
730bf215546Sopenharmony_ci      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
731bf215546Sopenharmony_ci
732bf215546Sopenharmony_ci      if (candidate->opcode == aco_opcode::p_logical_end)
733bf215546Sopenharmony_ci         break;
734bf215546Sopenharmony_ci
735bf215546Sopenharmony_ci      /* check if candidate depends on current */
736bf215546Sopenharmony_ci      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps(up_cursor);
737bf215546Sopenharmony_ci      /* no need to steal from following VMEM instructions */
738bf215546Sopenharmony_ci      if (is_dependency && (candidate->isVMEM() || candidate->isFlatLike()))
739bf215546Sopenharmony_ci         break;
740bf215546Sopenharmony_ci
741bf215546Sopenharmony_ci      if (found_dependency) {
742bf215546Sopenharmony_ci         HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
743bf215546Sopenharmony_ci         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
744bf215546Sopenharmony_ci             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
745bf215546Sopenharmony_ci             haz == hazard_fail_export)
746bf215546Sopenharmony_ci            is_dependency = true;
747bf215546Sopenharmony_ci         else if (haz != hazard_success)
748bf215546Sopenharmony_ci            break;
749bf215546Sopenharmony_ci      }
750bf215546Sopenharmony_ci
751bf215546Sopenharmony_ci      if (is_dependency) {
752bf215546Sopenharmony_ci         if (!found_dependency) {
753bf215546Sopenharmony_ci            ctx.mv.upwards_update_insert_idx(up_cursor);
754bf215546Sopenharmony_ci            init_hazard_query(&hq);
755bf215546Sopenharmony_ci            found_dependency = true;
756bf215546Sopenharmony_ci         }
757bf215546Sopenharmony_ci      }
758bf215546Sopenharmony_ci
759bf215546Sopenharmony_ci      if (is_dependency || !found_dependency) {
760bf215546Sopenharmony_ci         if (found_dependency)
761bf215546Sopenharmony_ci            add_to_hazard_query(&hq, candidate.get());
762bf215546Sopenharmony_ci         else
763bf215546Sopenharmony_ci            k++;
764bf215546Sopenharmony_ci         ctx.mv.upwards_skip(up_cursor);
765bf215546Sopenharmony_ci         continue;
766bf215546Sopenharmony_ci      }
767bf215546Sopenharmony_ci
768bf215546Sopenharmony_ci      MoveResult res = ctx.mv.upwards_move(up_cursor);
769bf215546Sopenharmony_ci      if (res == move_fail_ssa || res == move_fail_rar) {
770bf215546Sopenharmony_ci         /* no need to steal from following VMEM instructions */
771bf215546Sopenharmony_ci         if (res == move_fail_ssa && (candidate->isVMEM() || candidate->isFlatLike()))
772bf215546Sopenharmony_ci            break;
773bf215546Sopenharmony_ci         add_to_hazard_query(&hq, candidate.get());
774bf215546Sopenharmony_ci         ctx.mv.upwards_skip(up_cursor);
775bf215546Sopenharmony_ci         continue;
776bf215546Sopenharmony_ci      } else if (res == move_fail_pressure) {
777bf215546Sopenharmony_ci         break;
778bf215546Sopenharmony_ci      }
779bf215546Sopenharmony_ci      k++;
780bf215546Sopenharmony_ci   }
781bf215546Sopenharmony_ci
782bf215546Sopenharmony_ci   ctx.last_SMEM_dep_idx = found_dependency ? up_cursor.insert_idx : 0;
783bf215546Sopenharmony_ci   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
784bf215546Sopenharmony_ci}
785bf215546Sopenharmony_ci
786bf215546Sopenharmony_civoid
787bf215546Sopenharmony_cischedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
788bf215546Sopenharmony_ci              Instruction* current, int idx)
789bf215546Sopenharmony_ci{
790bf215546Sopenharmony_ci   assert(idx != 0);
791bf215546Sopenharmony_ci   int window_size = VMEM_WINDOW_SIZE;
792bf215546Sopenharmony_ci   int max_moves = VMEM_MAX_MOVES;
793bf215546Sopenharmony_ci   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
794bf215546Sopenharmony_ci   bool only_clauses = false;
795bf215546Sopenharmony_ci   int16_t k = 0;
796bf215546Sopenharmony_ci
797bf215546Sopenharmony_ci   /* first, check if we have instructions before current to move down */
798bf215546Sopenharmony_ci   hazard_query indep_hq;
799bf215546Sopenharmony_ci   hazard_query clause_hq;
800bf215546Sopenharmony_ci   init_hazard_query(&indep_hq);
801bf215546Sopenharmony_ci   init_hazard_query(&clause_hq);
802bf215546Sopenharmony_ci   add_to_hazard_query(&indep_hq, current);
803bf215546Sopenharmony_ci
804bf215546Sopenharmony_ci   DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
805bf215546Sopenharmony_ci
806bf215546Sopenharmony_ci   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
807bf215546Sopenharmony_ci        candidate_idx--) {
808bf215546Sopenharmony_ci      assert(candidate_idx == cursor.source_idx);
809bf215546Sopenharmony_ci      assert(candidate_idx >= 0);
810bf215546Sopenharmony_ci      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
811bf215546Sopenharmony_ci      bool is_vmem = candidate->isVMEM() || candidate->isFlatLike();
812bf215546Sopenharmony_ci
813bf215546Sopenharmony_ci      /* break when encountering another VMEM instruction, logical_start or barriers */
814bf215546Sopenharmony_ci      if (candidate->opcode == aco_opcode::p_logical_start)
815bf215546Sopenharmony_ci         break;
816bf215546Sopenharmony_ci
817bf215546Sopenharmony_ci      /* break if we'd make the previous SMEM instruction stall */
818bf215546Sopenharmony_ci      bool can_stall_prev_smem =
819bf215546Sopenharmony_ci         idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
820bf215546Sopenharmony_ci      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
821bf215546Sopenharmony_ci         break;
822bf215546Sopenharmony_ci
823bf215546Sopenharmony_ci      bool part_of_clause = false;
824bf215546Sopenharmony_ci      if (current->isVMEM() == candidate->isVMEM()) {
825bf215546Sopenharmony_ci         int grab_dist = cursor.insert_idx_clause - candidate_idx;
826bf215546Sopenharmony_ci         /* We can't easily tell how much this will decrease the def-to-use
827bf215546Sopenharmony_ci          * distances, so just use how far it will be moved as a heuristic. */
828bf215546Sopenharmony_ci         part_of_clause =
829bf215546Sopenharmony_ci            grab_dist < clause_max_grab_dist + k && should_form_clause(current, candidate.get());
830bf215546Sopenharmony_ci      }
831bf215546Sopenharmony_ci
832bf215546Sopenharmony_ci      /* if current depends on candidate, add additional dependencies and continue */
833bf215546Sopenharmony_ci      bool can_move_down = !is_vmem || part_of_clause || candidate->definitions.empty();
834bf215546Sopenharmony_ci      if (only_clauses) {
835bf215546Sopenharmony_ci         /* In case of high register pressure, only try to form clauses,
836bf215546Sopenharmony_ci          * and only if the previous clause is not larger
837bf215546Sopenharmony_ci          * than the current one will be.
838bf215546Sopenharmony_ci          */
839bf215546Sopenharmony_ci         if (part_of_clause) {
840bf215546Sopenharmony_ci            int clause_size = cursor.insert_idx - cursor.insert_idx_clause;
841bf215546Sopenharmony_ci            int prev_clause_size = 1;
842bf215546Sopenharmony_ci            while (should_form_clause(current,
843bf215546Sopenharmony_ci                                      block->instructions[candidate_idx - prev_clause_size].get()))
844bf215546Sopenharmony_ci               prev_clause_size++;
845bf215546Sopenharmony_ci            if (prev_clause_size > clause_size + 1)
846bf215546Sopenharmony_ci               break;
847bf215546Sopenharmony_ci         } else {
848bf215546Sopenharmony_ci            can_move_down = false;
849bf215546Sopenharmony_ci         }
850bf215546Sopenharmony_ci      }
851bf215546Sopenharmony_ci      HazardResult haz =
852bf215546Sopenharmony_ci         perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
853bf215546Sopenharmony_ci      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
854bf215546Sopenharmony_ci          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
855bf215546Sopenharmony_ci          haz == hazard_fail_export)
856bf215546Sopenharmony_ci         can_move_down = false;
857bf215546Sopenharmony_ci      else if (haz != hazard_success)
858bf215546Sopenharmony_ci         break;
859bf215546Sopenharmony_ci
860bf215546Sopenharmony_ci      if (!can_move_down) {
861bf215546Sopenharmony_ci         if (part_of_clause)
862bf215546Sopenharmony_ci            break;
863bf215546Sopenharmony_ci         add_to_hazard_query(&indep_hq, candidate.get());
864bf215546Sopenharmony_ci         add_to_hazard_query(&clause_hq, candidate.get());
865bf215546Sopenharmony_ci         ctx.mv.downwards_skip(cursor);
866bf215546Sopenharmony_ci         continue;
867bf215546Sopenharmony_ci      }
868bf215546Sopenharmony_ci
869bf215546Sopenharmony_ci      Instruction* candidate_ptr = candidate.get();
870bf215546Sopenharmony_ci      MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause);
871bf215546Sopenharmony_ci      if (res == move_fail_ssa || res == move_fail_rar) {
872bf215546Sopenharmony_ci         if (part_of_clause)
873bf215546Sopenharmony_ci            break;
874bf215546Sopenharmony_ci         add_to_hazard_query(&indep_hq, candidate.get());
875bf215546Sopenharmony_ci         add_to_hazard_query(&clause_hq, candidate.get());
876bf215546Sopenharmony_ci         ctx.mv.downwards_skip(cursor);
877bf215546Sopenharmony_ci         continue;
878bf215546Sopenharmony_ci      } else if (res == move_fail_pressure) {
879bf215546Sopenharmony_ci         only_clauses = true;
880bf215546Sopenharmony_ci         if (part_of_clause)
881bf215546Sopenharmony_ci            break;
882bf215546Sopenharmony_ci         add_to_hazard_query(&indep_hq, candidate.get());
883bf215546Sopenharmony_ci         add_to_hazard_query(&clause_hq, candidate.get());
884bf215546Sopenharmony_ci         ctx.mv.downwards_skip(cursor);
885bf215546Sopenharmony_ci         continue;
886bf215546Sopenharmony_ci      }
887bf215546Sopenharmony_ci      if (part_of_clause)
888bf215546Sopenharmony_ci         add_to_hazard_query(&indep_hq, candidate_ptr);
889bf215546Sopenharmony_ci      else
890bf215546Sopenharmony_ci         k++;
891bf215546Sopenharmony_ci      if (candidate_idx < ctx.last_SMEM_dep_idx)
892bf215546Sopenharmony_ci         ctx.last_SMEM_stall++;
893bf215546Sopenharmony_ci   }
894bf215546Sopenharmony_ci
895bf215546Sopenharmony_ci   /* find the first instruction depending on current or find another VMEM */
896bf215546Sopenharmony_ci   UpwardsCursor up_cursor = ctx.mv.upwards_init(idx + 1, true);
897bf215546Sopenharmony_ci
898bf215546Sopenharmony_ci   bool found_dependency = false;
899bf215546Sopenharmony_ci   /* second, check if we have instructions after current to move up */
900bf215546Sopenharmony_ci   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
901bf215546Sopenharmony_ci        candidate_idx++) {
902bf215546Sopenharmony_ci      assert(candidate_idx == up_cursor.source_idx);
903bf215546Sopenharmony_ci      assert(candidate_idx < (int)block->instructions.size());
904bf215546Sopenharmony_ci      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
905bf215546Sopenharmony_ci      bool is_vmem = candidate->isVMEM() || candidate->isFlatLike();
906bf215546Sopenharmony_ci
907bf215546Sopenharmony_ci      if (candidate->opcode == aco_opcode::p_logical_end)
908bf215546Sopenharmony_ci         break;
909bf215546Sopenharmony_ci
910bf215546Sopenharmony_ci      /* check if candidate depends on current */
911bf215546Sopenharmony_ci      bool is_dependency = false;
912bf215546Sopenharmony_ci      if (found_dependency) {
913bf215546Sopenharmony_ci         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
914bf215546Sopenharmony_ci         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
915bf215546Sopenharmony_ci             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
916bf215546Sopenharmony_ci             haz == hazard_fail_barrier || haz == hazard_fail_export)
917bf215546Sopenharmony_ci            is_dependency = true;
918bf215546Sopenharmony_ci         else if (haz != hazard_success)
919bf215546Sopenharmony_ci            break;
920bf215546Sopenharmony_ci      }
921bf215546Sopenharmony_ci
922bf215546Sopenharmony_ci      is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps(up_cursor);
923bf215546Sopenharmony_ci      if (is_dependency) {
924bf215546Sopenharmony_ci         if (!found_dependency) {
925bf215546Sopenharmony_ci            ctx.mv.upwards_update_insert_idx(up_cursor);
926bf215546Sopenharmony_ci            init_hazard_query(&indep_hq);
927bf215546Sopenharmony_ci            found_dependency = true;
928bf215546Sopenharmony_ci         }
929bf215546Sopenharmony_ci      } else if (is_vmem) {
930bf215546Sopenharmony_ci         /* don't move up dependencies of other VMEM instructions */
931bf215546Sopenharmony_ci         for (const Definition& def : candidate->definitions) {
932bf215546Sopenharmony_ci            if (def.isTemp())
933bf215546Sopenharmony_ci               ctx.mv.depends_on[def.tempId()] = true;
934bf215546Sopenharmony_ci         }
935bf215546Sopenharmony_ci      }
936bf215546Sopenharmony_ci
937bf215546Sopenharmony_ci      if (is_dependency || !found_dependency) {
938bf215546Sopenharmony_ci         if (found_dependency)
939bf215546Sopenharmony_ci            add_to_hazard_query(&indep_hq, candidate.get());
940bf215546Sopenharmony_ci         else
941bf215546Sopenharmony_ci            k++;
942bf215546Sopenharmony_ci         ctx.mv.upwards_skip(up_cursor);
943bf215546Sopenharmony_ci         continue;
944bf215546Sopenharmony_ci      }
945bf215546Sopenharmony_ci
946bf215546Sopenharmony_ci      MoveResult res = ctx.mv.upwards_move(up_cursor);
947bf215546Sopenharmony_ci      if (res == move_fail_ssa || res == move_fail_rar) {
948bf215546Sopenharmony_ci         add_to_hazard_query(&indep_hq, candidate.get());
949bf215546Sopenharmony_ci         ctx.mv.upwards_skip(up_cursor);
950bf215546Sopenharmony_ci         continue;
951bf215546Sopenharmony_ci      } else if (res == move_fail_pressure) {
952bf215546Sopenharmony_ci         break;
953bf215546Sopenharmony_ci      }
954bf215546Sopenharmony_ci      k++;
955bf215546Sopenharmony_ci   }
956bf215546Sopenharmony_ci}
957bf215546Sopenharmony_ci
958bf215546Sopenharmony_civoid
959bf215546Sopenharmony_cischedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
960bf215546Sopenharmony_ci                         Instruction* current, int idx)
961bf215546Sopenharmony_ci{
962bf215546Sopenharmony_ci   assert(idx != 0);
963bf215546Sopenharmony_ci   int window_size = POS_EXP_WINDOW_SIZE / ctx.schedule_pos_export_div;
964bf215546Sopenharmony_ci   int max_moves = POS_EXP_MAX_MOVES / ctx.schedule_pos_export_div;
965bf215546Sopenharmony_ci   int16_t k = 0;
966bf215546Sopenharmony_ci
967bf215546Sopenharmony_ci   DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false);
968bf215546Sopenharmony_ci
969bf215546Sopenharmony_ci   hazard_query hq;
970bf215546Sopenharmony_ci   init_hazard_query(&hq);
971bf215546Sopenharmony_ci   add_to_hazard_query(&hq, current);
972bf215546Sopenharmony_ci
973bf215546Sopenharmony_ci   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
974bf215546Sopenharmony_ci        candidate_idx--) {
975bf215546Sopenharmony_ci      assert(candidate_idx >= 0);
976bf215546Sopenharmony_ci      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
977bf215546Sopenharmony_ci
978bf215546Sopenharmony_ci      if (candidate->opcode == aco_opcode::p_logical_start)
979bf215546Sopenharmony_ci         break;
980bf215546Sopenharmony_ci      if (candidate->isVMEM() || candidate->isSMEM() || candidate->isFlatLike())
981bf215546Sopenharmony_ci         break;
982bf215546Sopenharmony_ci
983bf215546Sopenharmony_ci      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
984bf215546Sopenharmony_ci      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
985bf215546Sopenharmony_ci         break;
986bf215546Sopenharmony_ci
987bf215546Sopenharmony_ci      if (haz != hazard_success) {
988bf215546Sopenharmony_ci         add_to_hazard_query(&hq, candidate.get());
989bf215546Sopenharmony_ci         ctx.mv.downwards_skip(cursor);
990bf215546Sopenharmony_ci         continue;
991bf215546Sopenharmony_ci      }
992bf215546Sopenharmony_ci
993bf215546Sopenharmony_ci      MoveResult res = ctx.mv.downwards_move(cursor, false);
994bf215546Sopenharmony_ci      if (res == move_fail_ssa || res == move_fail_rar) {
995bf215546Sopenharmony_ci         add_to_hazard_query(&hq, candidate.get());
996bf215546Sopenharmony_ci         ctx.mv.downwards_skip(cursor);
997bf215546Sopenharmony_ci         continue;
998bf215546Sopenharmony_ci      } else if (res == move_fail_pressure) {
999bf215546Sopenharmony_ci         break;
1000bf215546Sopenharmony_ci      }
1001bf215546Sopenharmony_ci      k++;
1002bf215546Sopenharmony_ci   }
1003bf215546Sopenharmony_ci}
1004bf215546Sopenharmony_ci
1005bf215546Sopenharmony_civoid
1006bf215546Sopenharmony_cischedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
1007bf215546Sopenharmony_ci{
1008bf215546Sopenharmony_ci   ctx.last_SMEM_dep_idx = 0;
1009bf215546Sopenharmony_ci   ctx.last_SMEM_stall = INT16_MIN;
1010bf215546Sopenharmony_ci   ctx.mv.block = block;
1011bf215546Sopenharmony_ci   ctx.mv.register_demand = live_vars.register_demand[block->index].data();
1012bf215546Sopenharmony_ci
1013bf215546Sopenharmony_ci   /* go through all instructions and find memory loads */
1014bf215546Sopenharmony_ci   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
1015bf215546Sopenharmony_ci      Instruction* current = block->instructions[idx].get();
1016bf215546Sopenharmony_ci
1017bf215546Sopenharmony_ci      if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) {
1018bf215546Sopenharmony_ci         unsigned target = current->exp().dest;
1019bf215546Sopenharmony_ci         if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
1020bf215546Sopenharmony_ci            ctx.mv.current = current;
1021bf215546Sopenharmony_ci            schedule_position_export(ctx, block, live_vars.register_demand[block->index], current,
1022bf215546Sopenharmony_ci                                     idx);
1023bf215546Sopenharmony_ci         }
1024bf215546Sopenharmony_ci      }
1025bf215546Sopenharmony_ci
1026bf215546Sopenharmony_ci      if (current->definitions.empty())
1027bf215546Sopenharmony_ci         continue;
1028bf215546Sopenharmony_ci
1029bf215546Sopenharmony_ci      if (current->isVMEM() || current->isFlatLike()) {
1030bf215546Sopenharmony_ci         ctx.mv.current = current;
1031bf215546Sopenharmony_ci         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
1032bf215546Sopenharmony_ci      }
1033bf215546Sopenharmony_ci
1034bf215546Sopenharmony_ci      if (current->isSMEM()) {
1035bf215546Sopenharmony_ci         ctx.mv.current = current;
1036bf215546Sopenharmony_ci         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
1037bf215546Sopenharmony_ci      }
1038bf215546Sopenharmony_ci   }
1039bf215546Sopenharmony_ci
1040bf215546Sopenharmony_ci   /* resummarize the block's register demand */
1041bf215546Sopenharmony_ci   block->register_demand = RegisterDemand();
1042bf215546Sopenharmony_ci   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
1043bf215546Sopenharmony_ci      block->register_demand.update(live_vars.register_demand[block->index][idx]);
1044bf215546Sopenharmony_ci   }
1045bf215546Sopenharmony_ci}
1046bf215546Sopenharmony_ci
1047bf215546Sopenharmony_civoid
1048bf215546Sopenharmony_cischedule_program(Program* program, live& live_vars)
1049bf215546Sopenharmony_ci{
1050bf215546Sopenharmony_ci   /* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
1051bf215546Sopenharmony_ci   RegisterDemand demand;
1052bf215546Sopenharmony_ci   for (Block& block : program->blocks)
1053bf215546Sopenharmony_ci      demand.update(block.register_demand);
1054bf215546Sopenharmony_ci   demand.vgpr += program->config->num_shared_vgprs / 2;
1055bf215546Sopenharmony_ci
1056bf215546Sopenharmony_ci   sched_ctx ctx;
1057bf215546Sopenharmony_ci   ctx.mv.depends_on.resize(program->peekAllocationId());
1058bf215546Sopenharmony_ci   ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
1059bf215546Sopenharmony_ci   ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
1060bf215546Sopenharmony_ci   /* Allowing the scheduler to reduce the number of waves to as low as 5
1061bf215546Sopenharmony_ci    * improves performance of Thrones of Britannia significantly and doesn't
1062bf215546Sopenharmony_ci    * seem to hurt anything else. */
1063bf215546Sopenharmony_ci   // TODO: account for possible uneven num_waves on GFX10+
1064bf215546Sopenharmony_ci   unsigned wave_fac = program->dev.physical_vgprs / 256;
1065bf215546Sopenharmony_ci   if (program->num_waves <= 5 * wave_fac)
1066bf215546Sopenharmony_ci      ctx.num_waves = program->num_waves;
1067bf215546Sopenharmony_ci   else if (demand.vgpr >= 29)
1068bf215546Sopenharmony_ci      ctx.num_waves = 5 * wave_fac;
1069bf215546Sopenharmony_ci   else if (demand.vgpr >= 25)
1070bf215546Sopenharmony_ci      ctx.num_waves = 6 * wave_fac;
1071bf215546Sopenharmony_ci   else
1072bf215546Sopenharmony_ci      ctx.num_waves = 7 * wave_fac;
1073bf215546Sopenharmony_ci   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
1074bf215546Sopenharmony_ci   ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);
1075bf215546Sopenharmony_ci   ctx.num_waves = max_suitable_waves(program, ctx.num_waves);
1076bf215546Sopenharmony_ci
1077bf215546Sopenharmony_ci   /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
1078bf215546Sopenharmony_ci   ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
1079bf215546Sopenharmony_ci
1080bf215546Sopenharmony_ci   assert(ctx.num_waves > 0);
1081bf215546Sopenharmony_ci   ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
1082bf215546Sopenharmony_ci                           int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
1083bf215546Sopenharmony_ci
1084bf215546Sopenharmony_ci   /* NGG culling shaders are very sensitive to position export scheduling.
1085bf215546Sopenharmony_ci    * Schedule less aggressively when early primitive export is used, and
1086bf215546Sopenharmony_ci    * keep the position export at the very bottom when late primitive export is used.
1087bf215546Sopenharmony_ci    */
1088bf215546Sopenharmony_ci   if (program->info.has_ngg_culling && program->stage.num_sw_stages() == 1) {
1089bf215546Sopenharmony_ci      if (!program->info.has_ngg_early_prim_export)
1090bf215546Sopenharmony_ci         ctx.schedule_pos_exports = false;
1091bf215546Sopenharmony_ci      else
1092bf215546Sopenharmony_ci         ctx.schedule_pos_export_div = 4;
1093bf215546Sopenharmony_ci   }
1094bf215546Sopenharmony_ci
1095bf215546Sopenharmony_ci   for (Block& block : program->blocks)
1096bf215546Sopenharmony_ci      schedule_block(ctx, program, &block, live_vars);
1097bf215546Sopenharmony_ci
1098bf215546Sopenharmony_ci   /* update max_reg_demand and num_waves */
1099bf215546Sopenharmony_ci   RegisterDemand new_demand;
1100bf215546Sopenharmony_ci   for (Block& block : program->blocks) {
1101bf215546Sopenharmony_ci      new_demand.update(block.register_demand);
1102bf215546Sopenharmony_ci   }
1103bf215546Sopenharmony_ci   update_vgpr_sgpr_demand(program, new_demand);
1104bf215546Sopenharmony_ci
1105bf215546Sopenharmony_ci/* if enabled, this code asserts that register_demand is updated correctly */
1106bf215546Sopenharmony_ci#if 0
1107bf215546Sopenharmony_ci   int prev_num_waves = program->num_waves;
1108bf215546Sopenharmony_ci   const RegisterDemand prev_max_demand = program->max_reg_demand;
1109bf215546Sopenharmony_ci
1110bf215546Sopenharmony_ci   std::vector<RegisterDemand> demands(program->blocks.size());
1111bf215546Sopenharmony_ci   for (unsigned j = 0; j < program->blocks.size(); j++) {
1112bf215546Sopenharmony_ci      demands[j] = program->blocks[j].register_demand;
1113bf215546Sopenharmony_ci   }
1114bf215546Sopenharmony_ci
1115bf215546Sopenharmony_ci   live live_vars2 = aco::live_var_analysis(program);
1116bf215546Sopenharmony_ci
1117bf215546Sopenharmony_ci   for (unsigned j = 0; j < program->blocks.size(); j++) {
1118bf215546Sopenharmony_ci      Block &b = program->blocks[j];
1119bf215546Sopenharmony_ci      for (unsigned i = 0; i < b.instructions.size(); i++)
1120bf215546Sopenharmony_ci         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
1121bf215546Sopenharmony_ci      assert(b.register_demand == demands[j]);
1122bf215546Sopenharmony_ci   }
1123bf215546Sopenharmony_ci
1124bf215546Sopenharmony_ci   assert(program->max_reg_demand == prev_max_demand);
1125bf215546Sopenharmony_ci   assert(program->num_waves == prev_num_waves);
1126bf215546Sopenharmony_ci#endif
1127bf215546Sopenharmony_ci}
1128bf215546Sopenharmony_ci
1129bf215546Sopenharmony_ci} // namespace aco
1130