/*
 * Copyright (c) 2017 Lima Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include <limits.h>

#include "gpir.h"

/*
 * GP scheduling algorithm (by Connor Abbott <cwabbott0@gmail.com>)
 *
 * The GP pipeline has three main stages:
 *
 * --------------------------------------------------------
 * |                                                      |
 * |               Register/Attr/Temp Fetch               |
 * |                                                      |
 * --------------------------------------------------------
 * |        |        |        |        |        |         |
 * |  Mul0  |  Mul1  |  Add0  |  Add1  |  Cplx  |  Pass   |
 * |        |        |        |        |        |         |
 * --------------------------------------------------------
 * |                 |                          |         |
 * |    Complex1     |   Temp/Register/Varying  |  Pass   |
 * |    Stage 2      |         Store            | Stage 2 |
 * |                 |                          |         |
 * --------------------------------------------------------
 *
 * Because of this setup, storing a register has a latency of three cycles.
 * Also, the register file is organized into 4-component vectors, and the
 * load stage can only load two vectors at a time. Aside from these highly
 * constrained register load/store units, there is an explicit bypass
 * network, where each unit (mul0/mul1/etc.) can access the result of any
 * unit from the previous two cycles directly, except for the complex unit,
 * whose result can only be accessed for one cycle (since it's expected to
 * be used directly by the complex2 instruction in the following cycle).
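 *
 * For example, a value produced by mul0 in instruction i can still be read
 * directly by any unit in instructions i+1 and i+2, while a value produced
 * by the complex unit in instruction i can only be read in instruction i+1.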
 *
 * Because of the very restricted register file, and because only rarely are
 * all the units in use at the same time, it can be very beneficial to
 * "thread" a value from source to destination by placing moves in the
 * otherwise-unused units, without involving the register file at all. It's
 * very difficult to fully exploit this with a traditional scheduler, so we
 * need to do something a little un-traditional. The 512 instruction limit
 * means that for more complex shaders, we need to do as well as possible or
 * else the app won't even work.
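 *
 * As an illustrative sketch (made-up syntax, not real IR): a value produced
 * at instruction i but needed at instruction i+3 is out of direct bypass
 * range, so a single move in an otherwise-idle slot keeps it alive:
 *
 *    instr i:    add0: t  = add a, b
 *    instr i+2:  pass: t' = mov t      (t is still reachable, 2 cycles away)
 *    instr i+3:  mul0: .. = mul t', c  (t' is 1 cycle away)
 *
 * instead of a store_reg/load_reg round trip through the register file.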
 *
 * The scheduler works by considering the bypass network as a kind of
 * register file. It's a quite unusual register file, since registers have to
 * be assigned "on the fly" as we schedule operations, but with some care, we
 * can use something conceptually similar to a linear-scan allocator to
 * successfully schedule nodes to instructions without running into
 * conflicts.
 *
 * Values in the IR are separated into normal values, or "value registers",
 * which are what normal nodes like add, mul, etc. produce and which only
 * live inside one basic block, and registers, which can span multiple basic
 * blocks but have to be accessed via special load_reg/store_reg nodes. RA
 * assigns physical registers to both value registers and normal registers,
 * treating load_reg/store_reg as a move instruction, but these are only used
 * directly for normal registers -- the physreg assigned to a value register
 * is "fake," and is only used inside the scheduler. Before scheduling we
 * insert the extra dependencies (e.g. write-after-read) implied by these
 * physreg assignments, even for value registers, as if we were going to use
 * them, but then we throw them away. For example, if we had something like:
 *
 * (*)r2 = add (*)r1, (*)r2
 * (*)r1 = load_reg r0
 *
 * we'd insert a write-after-read dependency between the add and load_reg,
 * even though the starred registers aren't actually used by the scheduler
 * after this step. This step is crucial since it guarantees that at any
 * point in the schedule, the number of live registers + live value registers
 * will never exceed the combined capacity of the register file and the
 * bypass network. This is because each live register/value register will
 * have a different fake number, thanks to the fake dependencies inserted
 * before scheduling. This means the scheduler never has to worry about
 * spilling to temporaries, which is only done ahead of time.
 *
 * The scheduler is a bottom-up scheduler. It keeps track of each live value
 * register, and decides on-the-fly which value registers to keep in the
 * bypass network and which to "spill" to registers. Of particular importance
 * is the "ready list," which consists of "input nodes" (nodes that produce a
 * value that can be consumed via the bypass network), both "partially ready"
 * (only some of the uses have been scheduled) and "fully ready" (all uses
 * have been scheduled), as well as other non-input nodes like register
 * stores. Each input node on the ready list represents a live value register
 * before the current instruction. There must be at most 11 such input nodes
 * at all times, since there are only 11 slots in the next two instructions
 * which can reach the current instruction (all 6 slots one instruction away,
 * plus the 5 non-complex slots two instructions away).
 *
 * An input node is a "max node" if it has a use two cycles ago, which must be
 * connected to a definition this cycle. Otherwise it may be a "next max node"
 * if it will be a max node on the next instruction (i.e. it has a use at most
 * one cycle ago), or it may be neither if all of its uses are this cycle. As
 * we keep adding instructions to the front, input nodes graduate from
 * neither, to next max, to max, unless we decide to insert a move to keep the
 * value alive longer, at which point any uses after the current instruction
 * are rewritten to be uses of the move so that the original node returns to
 * neither. The scheduler is free to decide which nodes to try, but we have to
 * reserve slots for two different reasons: (1) out of the 5 non-complex
 * slots, we reserve a slot for each max node, so that we can connect a
 * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a
 * slot for every next-max node above 5, so that for the next instruction
 * there are no more than 5 max nodes. When a max or next-max node gets
 * scheduled, the corresponding reservation is reduced by one. At the end, we
 * insert moves for every slot that was reserved. The reservation is actually
 * managed by gpir_instr, and all we have to do is tell it how many to reserve
 * at the beginning and then tell it which nodes are max/next-max nodes. When
 * we start scheduling an instruction, there will be at most 5 max nodes
 * thanks to the previous instruction's next-max reservation/move insertion.
 * Since there are at most 11 total input nodes, if there are N max nodes,
 * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 =
 * 6 - N slots need to be reserved for next-max nodes, and so at most
 * 6 - N + N = 6 slots need to be reserved in total, exactly the total number
 * of slots. So, thanks to the total input node restriction, we will never
 * need to reserve too many slots.
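 *
 * As a concrete instance of the arithmetic above: with N = 2 max nodes there
 * can be at most 11 - 2 = 9 next-max nodes, so we reserve 2 of the 5
 * non-complex slots for the max nodes and 9 - 5 = 4 of the 6 slots for the
 * next-max nodes, i.e. 6 reservations in total -- exactly what fits.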
 *
 * It sometimes happens that scheduling a given node would violate this total
 * input node restriction, or that a reservation means we can't schedule it.
 * We first schedule a node "speculatively" to see if this is a problem. If
 * some of the node's sources are loads, then we can schedule the node and
 * its dependent loads in one go to avoid going over the pressure limit. If
 * that fails, we can try to spill a ready or partially-ready input node to a
 * register by rewriting all of its uses to refer to a register load. This
 * removes it from the list of ready and partially ready input nodes, as all
 * of its uses are now unscheduled. If successful, we can then proceed with
 * scheduling the original node. All of this happens "speculatively," meaning
 * that afterwards the node is removed and the entire state of the scheduler
 * is reverted to before it was tried, to ensure that we never get into an
 * invalid state and run out of spots for moves. In try_nodes(), we
 * speculatively try to schedule each node on the ready list, keeping only
 * the nodes that could be successfully scheduled, so that when we finally
 * decide which node to actually schedule, we know it will succeed. This is
 * how we decide on the fly which values go in registers and which go in the
 * bypass network. Note that "unspilling" a value is simply a matter of
 * scheduling the store_reg instruction created when we spill.
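 *
 * As a rough illustration (pseudocode only -- the real logic lives in
 * try_nodes() and schedule_try_node() below):
 *
 *    score = schedule_try_node(ctx, node, true);  // speculative, reverted
 *    if (score == INT_MIN)
 *       spill an input node to a register, then retry speculatively;
 *    ...
 *    schedule_try_node(ctx, best_node, false);    // commit the winner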
 *
 * The careful accounting of live value registers, reservations for moves, and
 * speculative scheduling guarantee that we never run into a failure case
 * while scheduling. However, we need to make sure that this scheduler will
 * not get stuck in an infinite loop, i.e. that we'll always make forward
 * progress by eventually scheduling a non-move node. If we run out of value
 * registers, then we may have to spill a node to a register. Say that
 * scheduling one of the fully-ready nodes would leave us with 11 + N live
 * value registers before the current instruction. Since there are at most
 * 64 + 11 live registers and value registers in total, thanks to the fake
 * dependencies we inserted before scheduling, there are at most 64 - N live
 * physical registers, and therefore there are at least N registers available
 * for spilling. Not all these registers will be available immediately, since
 * in order to spill a node to a given register we have to ensure that there
 * are slots available to rewrite every use to a load instruction, and that
 * may not be the case. There may also be intervening writes which prevent
 * some registers from being used. However, these are all temporary problems,
 * since as we create each instruction, we create additional register load
 * slots that can be freely used for spilling, and we create more move nodes,
 * which means that the uses of the nodes we're trying to spill keep moving
 * forward. This means that eventually these problems will go away, at which
 * point we'll be able to spill a node successfully, so eventually we'll be
 * able to schedule the first node on the ready list.
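 *
 * For instance, if N = 3 then there would be 11 + 3 = 14 live value
 * registers; since at most 64 + 11 = 75 registers and value registers can be
 * live in total, at most 75 - 14 = 61 = 64 - 3 physical registers are live,
 * leaving at least 3 registers free to spill into.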
 */

typedef struct {
   /* This is the list of ready and partially-ready nodes. A partially-ready
    * node must have at least one input dependency already scheduled.
    */
   struct list_head ready_list;

   /* The number of ready or partially-ready nodes with at least one input
    * dependency already scheduled. In other words, the number of live value
    * registers. This must be at most 11.
    */
   int ready_list_slots;

   /* The physical registers live into the current instruction. */
   uint64_t live_physregs;

   /* The current instruction. */
   gpir_instr *instr;

   /* The current basic block. */
   gpir_block *block;

   /* True if at least one node failed to schedule due to lack of available
    * value registers.
    */
   bool try_spill_all;

   /* The number of max nodes that need to be spilled in order to
    * successfully schedule the instruction.
    */
   int max_node_spill_needed;

   /* The number of max and next-max nodes that need to be spilled in order
    * to successfully schedule the instruction.
    */
   int total_spill_needed;

   /* For each physical register, a linked list of loads associated with it in
    * this block. When we spill a value to a given register, and there are
    * existing loads associated with it that haven't been scheduled yet, we
    * have to make sure that the corresponding unspill happens after the last
    * original use has happened in program order, i.e. (since we schedule
    * bottom-up) that it is scheduled before that use.
    */
   struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM];
} sched_ctx;

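/* The "min dist" and "max dist" helpers below give, for a dependency, the
 * minimum and maximum number of instructions that may separate the producer
 * (pred) and the consumer (succ): e.g. a load can be placed in the same
 * instruction as its consumer, while a complex1 result takes two cycles to
 * arrive.
 */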
static int gpir_min_dist_alu(gpir_dep *dep)
{
   switch (dep->pred->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
   case gpir_op_load_reg:
   case gpir_op_load_attribute:
      return 0;

   case gpir_op_complex1:
      return 2;

   default:
      return 1;
   }
}

static int gpir_get_min_dist(gpir_dep *dep)
{
   switch (dep->type) {
   case GPIR_DEP_INPUT:
      switch (dep->succ->op) {
      case gpir_op_store_temp:
      case gpir_op_store_reg:
      case gpir_op_store_varying:
         /* Stores must use an alu node as input. Also, complex1 takes two
          * cycles, which means that its result cannot be stored to a register
          * as part of the normal path, and therefore it must also have a move
          * inserted.
          */
         if (dep->pred->type == gpir_node_type_load ||
             dep->pred->op == gpir_op_complex1)
            return INT_MAX >> 2;
         else
            return 0;

      default:
         return gpir_min_dist_alu(dep);
      }

   case GPIR_DEP_OFFSET:
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_min_dist_alu(dep);

   case GPIR_DEP_READ_AFTER_WRITE:
      if (dep->succ->op == gpir_op_load_temp &&
          dep->pred->op == gpir_op_store_temp) {
         return 4;
      } else if (dep->succ->op == gpir_op_load_reg &&
                 dep->pred->op == gpir_op_store_reg) {
         return 3;
      } else if ((dep->pred->op == gpir_op_store_temp_load_off0 ||
                  dep->pred->op == gpir_op_store_temp_load_off1 ||
                  dep->pred->op == gpir_op_store_temp_load_off2) &&
                 dep->succ->op == gpir_op_load_uniform) {
         return 4;
      } else {
         /* Fake dependency */
         return 0;
      }

   case GPIR_DEP_WRITE_AFTER_READ:
      return 0;
   }

   return 0;
}

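/* How many instructions later the consumer may be placed and still read the
 * producer's result directly: two cycles through the bypass network in the
 * general case, less for loads and a few special producers (see below).
 */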
static int gpir_max_dist_alu(gpir_dep *dep)
{
   switch (dep->pred->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
      return 0;
   case gpir_op_load_attribute:
      return 1;
   case gpir_op_load_reg:
      if (dep->pred->sched.pos < GPIR_INSTR_SLOT_REG0_LOAD0 ||
          dep->pred->sched.pos > GPIR_INSTR_SLOT_REG0_LOAD3)
         return 0;
      else
         return 1;
   case gpir_op_exp2_impl:
   case gpir_op_log2_impl:
   case gpir_op_rcp_impl:
   case gpir_op_rsqrt_impl:
   case gpir_op_store_temp_load_off0:
   case gpir_op_store_temp_load_off1:
   case gpir_op_store_temp_load_off2:
      return 1;
   case gpir_op_mov:
      if (dep->pred->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
         return 1;
      else
         return 2;
   default:
      return 2;
   }
}

static int gpir_get_max_dist(gpir_dep *dep)
{
   switch (dep->type) {
   case GPIR_DEP_INPUT:
      switch (dep->succ->op) {
      case gpir_op_store_temp:
      case gpir_op_store_reg:
      case gpir_op_store_varying:
         return 0;

      default:
         return gpir_max_dist_alu(dep);
      }

   case GPIR_DEP_OFFSET:
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_max_dist_alu(dep);

   default:
      return INT_MAX >> 2; /* Don't want to overflow... */
   }
}

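/* Recursively compute node->sched.dist: the length, in minimum ALU latencies,
 * of the longest path from any leaf up to this node. The ready list is kept
 * sorted by this value (except for schedule_first ops, which always go
 * first), so nodes on the critical path are tried earlier.
 */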
static void schedule_update_distance(gpir_node *node)
{
   if (gpir_node_is_leaf(node)) {
      node->sched.dist = 0;
      return;
   }

   gpir_node_foreach_pred(node, dep) {
      gpir_node *pred = dep->pred;

      if (pred->sched.dist < 0)
         schedule_update_distance(pred);

      int dist = pred->sched.dist + gpir_min_dist_alu(dep);
      if (node->sched.dist < dist)
         node->sched.dist = dist;
   }
}

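/* An "input node" is one whose result is consumed through the bypass
 * network, i.e. it has at least one INPUT successor; only such nodes occupy
 * a value register and hence a slot in the ready list accounting.
 */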
static bool gpir_is_input_node(gpir_node *node)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type == GPIR_DEP_INPUT)
         return true;
   }
   return false;
}


/* Get the number of slots required for a node on the ready list.
 */
static int gpir_get_slots_required(gpir_node *node)
{
   if (!gpir_is_input_node(node))
      return 0;

   /* Note that we assume every node consumes only one slot, even dual-slot
    * instructions. If it turns out that there isn't enough space for a
    * dual-slot instruction, we can always safely insert a move. There's the
    * risk that we get stuck in an infinite loop if all the fully ready nodes
    * are dual-slot nodes, but we rely on spilling to registers to save us
    * here.
    */
   return 1;
}

static void ASSERTED verify_ready_list(sched_ctx *ctx)
{
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node)) {
         assert(node->sched.ready);
      }

      if (node->sched.ready) {
         /* Every successor must have been scheduled */
         gpir_node_foreach_succ(node, dep) {
            assert(dep->succ->sched.instr);
         }
      } else {
         /* There must be at least one successor that's not scheduled. */
         bool unscheduled = false;
         gpir_node_foreach_succ(node, dep) {
            unscheduled |= !(dep->succ->sched.instr);
         }

         assert(unscheduled);
      }
   }
}

static void schedule_insert_ready_list(sched_ctx *ctx,
                                       gpir_node *insert_node)
{
   /* A node is either fully ready (all of its successors have been
    * scheduled) or partially ready (some of its input successors have been
    * scheduled). Both kinds need to be inserted into the ready list, but we
    * only schedule a move node for a partially ready node.
    */
   bool ready = true, insert = false;
   gpir_node_foreach_succ(insert_node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr) {
         if (dep->type == GPIR_DEP_INPUT)
            insert = true;
      }
      else
         ready = false;
   }

   insert_node->sched.ready = ready;
   /* for root node */
   insert |= ready;

   if (!insert || insert_node->sched.inserted)
      return;

   struct list_head *insert_pos = &ctx->ready_list;
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if ((insert_node->sched.dist > node->sched.dist ||
          gpir_op_infos[insert_node->op].schedule_first) &&
          !gpir_op_infos[node->op].schedule_first) {
         insert_pos = &node->list;
         break;
      }
   }

   list_addtail(&insert_node->list, insert_pos);
   insert_node->sched.inserted = true;
   ctx->ready_list_slots += gpir_get_slots_required(insert_node);
}

static int gpir_get_max_start(gpir_node *node)
{
   int max_start = 0;

   /* find the max start instr constrained by all successors */
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr)
         continue;

      int start = succ->sched.instr->index + gpir_get_min_dist(dep);
      if (start > max_start)
         max_start = start;
   }

   return max_start;
}

static int gpir_get_min_end(gpir_node *node)
{
   int min_end = INT_MAX;

   /* find the min end instr constrained by all successors */
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr)
         continue;

      int end = succ->sched.instr->index + gpir_get_max_dist(dep);
      if (end < min_end)
         min_end = end;
   }

   return min_end;
}

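/* Check whether the given instruction already contains an equivalent load
 * (same op, index and component), in which case a new load node can simply
 * reuse that slot instead of taking up another one.
 */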
static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
{
   gpir_load_node *load = gpir_node_to_load(node);

   for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) {
      if (!instr->slots[i])
         continue;

      gpir_load_node *iload = gpir_node_to_load(instr->slots[i]);
      if (load->node.op == iload->node.op &&
          load->index == iload->index &&
          load->component == iload->component)
         return &iload->node;
   }
   return NULL;
}

/* Simply place the node into the given instruction without trying to deal
 * with liveness or the ready list. This will only fail if the node cannot be
 * placed due to a lack of available slots. In addition to normal node
 * placement, this is also used for placing loads when spilling to registers.
 */
static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
{
   if (node->type == gpir_node_type_load) {
      gpir_node *load = gpir_sched_instr_has_load(instr, node);
      if (load) {
         /* This node may have a store as a successor, in which case we have
          * to fail it exactly like in the general path below, so that a move
          * node can later be created in between.
          */
         if (instr->index < gpir_get_max_start(node))
            return false;

         gpir_debug("same load %d in instr %d for node %d\n",
                    load->index, instr->index, node->index);

         /* We don't really merge the two nodes, just pretend this one is
          * scheduled at the same place.
          */
         node->sched.instr = load->sched.instr;
         node->sched.pos = load->sched.pos;
         return true;
      }
   }

   if (node->op == gpir_op_store_reg) {
      /* This register may be loaded in the next basic block, in which case
       * there still needs to be a 2 instruction gap. We do what the blob
       * seems to do and simply disable stores in the last two instructions of
       * the basic block.
       *
       * TODO: We may be able to do better than this, but we have to check
       * first if storing a register works across branches.
       */
      if (instr->index < 2)
         return false;
   }

   node->sched.instr = instr;

   int max_node_spill_needed = INT_MAX;
   int total_spill_needed = INT_MAX;
   int *slots = gpir_op_infos[node->op].slots;
   for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) {
      node->sched.pos = slots[i];
      if (instr->index >= gpir_get_max_start(node) &&
          instr->index <= gpir_get_min_end(node) &&
          gpir_instr_try_insert_node(instr, node))
         return true;
      if (ctx->instr->non_cplx_slot_difference ||
          ctx->instr->slot_difference) {
         /* If one of these fields is non-zero, then we could insert the node
          * here after spilling. To get an accurate count of how many nodes we
          * need to spill, we need to choose one of the positions where there
          * were nonzero slot differences, preferably one with the smallest
          * difference (so we don't have to spill as much).
          */
         if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed ||
             ctx->instr->slot_difference < total_spill_needed) {
            max_node_spill_needed = ctx->instr->non_cplx_slot_difference;
            total_spill_needed = ctx->instr->slot_difference;
         }
      }
   }

   if (max_node_spill_needed != INT_MAX) {
      /* Indicate how many spill nodes are needed. */
      ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed,
                                        max_node_spill_needed);
      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
                                     total_spill_needed);
   }
   node->sched.instr = NULL;
   node->sched.pos = -1;
   return false;
}


/* Try to place just the node given, updating the ready list. If "speculative"
 * is true, then this is part of the pre-commit phase. If false, then we have
 * committed to placing this node, so update liveness and ready list
 * information.
 */

static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node,
                                    bool speculative)
{
   if (!_try_place_node(ctx, ctx->instr, node)) {
      if (!speculative)
         gpir_debug("failed to place %d\n", node->index);
      return false;
   }

   ctx->ready_list_slots -= gpir_get_slots_required(node);

   if (!speculative) {
      gpir_debug("placed node %d\n", node->index);

      /* We assume here that writes are placed before reads. If this changes,
       * then this needs to be updated.
       */
      if (node->op == gpir_op_store_reg) {
         gpir_store_node *store = gpir_node_to_store(node);
         ctx->live_physregs &=
            ~(1ull << (4 * store->index + store->component));
         if (store->child->sched.physreg_store == store)
            store->child->sched.physreg_store = NULL;
      }

      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         ctx->live_physregs |=
            (1ull << (4 * load->index + load->component));
      }

      list_del(&node->list);
      list_add(&node->list, &ctx->block->node_list);
      gpir_node_foreach_pred(node, dep) {
         gpir_node *pred = dep->pred;
         schedule_insert_ready_list(ctx, pred);
      }
   } else {
      gpir_node_foreach_pred(node, dep) {
         gpir_node *pred = dep->pred;
         if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT)
            ctx->ready_list_slots += gpir_get_slots_required(pred);
      }
   }

   return true;
}

/* Create a new node with "node" as the child, replace all uses of "node" with
 * this new node, and replace "node" with it in the ready list.
 */
static gpir_node *create_replacement(sched_ctx *ctx, gpir_node *node,
                                     gpir_op op)
{

   gpir_alu_node *new_node = gpir_node_create(node->block, op);
   if (unlikely(!new_node))
      return NULL;

   new_node->children[0] = node;
   new_node->num_child = 1;

   new_node->node.sched.instr = NULL;
   new_node->node.sched.pos = -1;
   new_node->node.sched.dist = node->sched.dist;
   new_node->node.sched.max_node = node->sched.max_node;
   new_node->node.sched.next_max_node = node->sched.next_max_node;
   new_node->node.sched.complex_allowed = node->sched.complex_allowed;

   ctx->ready_list_slots--;
   list_del(&node->list);
   node->sched.max_node = false;
   node->sched.next_max_node = false;
   node->sched.ready = false;
   node->sched.inserted = false;
   gpir_node_replace_succ(&new_node->node, node);
   gpir_node_add_dep(&new_node->node, node, GPIR_DEP_INPUT);
   schedule_insert_ready_list(ctx, &new_node->node);
   return &new_node->node;
}

static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
{
   gpir_node *move = create_replacement(ctx, node, gpir_op_mov);
   gpir_debug("create move %d for %d\n", move->index, node->index);
   return move;
}

static gpir_node *create_postlog2(sched_ctx *ctx, gpir_node *node)
{
   assert(node->op == gpir_op_complex1);
   gpir_node *postlog2 = create_replacement(ctx, node, gpir_op_postlog2);
   gpir_debug("create postlog2 %d for %d\n", postlog2->index, node->index);
   return postlog2;
}

/* Once we schedule the successor, would the predecessor be fully ready? */
static bool pred_almost_ready(gpir_dep *dep)
{
   bool fully_ready = true;
   gpir_node_foreach_succ(dep->pred, other_dep) {
      gpir_node *succ = other_dep->succ;
      if (!succ->sched.instr && dep->succ != other_dep->succ) {
         fully_ready = false;
         break;
      }
   }

   return fully_ready;
}

/* Recursively try to schedule a node and all its dependent nodes that can fit
 * in the same instruction. There is a simple heuristic scoring system to try
 * to group together nodes that load different components of the same input,
 * to avoid bottlenecking for operations like matrix multiplies that are
 * mostly input-bound.
 */

static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
{
   if (!schedule_try_place_node(ctx, node, speculative))
      return INT_MIN;

   int score = 0;

   gpir_node_foreach_pred(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      int pred_score = INT_MIN;
      if (pred_almost_ready(dep)) {
         if (dep->pred->type == gpir_node_type_load ||
             node->type == gpir_node_type_store) {
            pred_score = _schedule_try_node(ctx, dep->pred, speculative);
         }
      }
      if (dep->pred->type == gpir_node_type_load ||
          node->type == gpir_node_type_store) {
         if (pred_score == INT_MIN) {
            if (node->op == gpir_op_mov) {
               /* The only moves on the ready list are for loads that we
                * couldn't schedule immediately, as created below. If we
                * couldn't schedule the load, there's no point scheduling
                * the move. The normal move threading logic will ensure
                * that another move is created if we're about to go too far
                * from the uses of this move.
                */
               assert(speculative);
               return INT_MIN;
            } else if (!speculative && dep->pred->type == gpir_node_type_load) {
               /* We couldn't schedule the load right away, so it will have
                * to happen in some earlier instruction and then be moved
                * into a value register and threaded to the use by "node".
                * We create the move right away, so that later we'll fail
                * to schedule it if there isn't a slot for a move
                * available.
                */
               create_move(ctx, dep->pred);
            }
            /* Penalize nodes whose dependent ops we couldn't schedule.
             */
            score--;
         } else {
            score += pred_score;
            continue;
         }
      }
   }

   return score;
}

/* If we speculatively tried a node, undo everything.
 */

static void schedule_undo_node(sched_ctx *ctx, gpir_node *node)
{
   gpir_instr_remove_node(ctx->instr, node);

   gpir_node_foreach_pred(node, dep) {
      gpir_node *pred = dep->pred;
      if (pred->sched.instr) {
         schedule_undo_node(ctx, pred);
      }
   }
}

/* Try to schedule a node. We also try to schedule any predecessors that can
 * be part of the same instruction. If "speculative" is true, then we don't
 * actually change any state, and only return the score the node would have
 * if it were scheduled, with INT_MIN meaning "cannot be scheduled at all".
 */
static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
{
   int prev_slots = ctx->ready_list_slots;

   int score = _schedule_try_node(ctx, node, speculative);

   if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) {
      assert(speculative);
      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
                                     ctx->ready_list_slots - GPIR_VALUE_REG_NUM);
      score = INT_MIN;
   }

   if (speculative) {
      ctx->ready_list_slots = prev_slots;
      if (node->sched.instr)
         schedule_undo_node(ctx, node);
   }

   return score;
}

/* This is called when we want to spill "node" by inserting loads at its uses.
 * It returns all the registers we could use such that all of the loads can
 * successfully be inserted. It also returns, through min_index, the first
 * instruction we'll need to insert a load for.
 */

static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node,
                                   int *min_index)
{
   uint64_t available = ~0ull;
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *use = dep->succ;
      gpir_instr *instr = use->sched.instr;

      if (!instr) {
         /* This use isn't scheduled, so no need to spill it. */
         continue;
      }

      if (use->type == gpir_node_type_store) {
         /* We're trying to spill something that was recently stored... just
          * bail out.
          */
         return 0;
      }

      if (use->op == gpir_op_mov && instr == ctx->instr) {
         /* We try to spill the sources of this move, so we can free up space
          * in the current instruction.
          *
          * TODO: should we go back further? It might let us schedule the
          * write earlier in some cases, but then we might fail to spill.
          */
         available &= get_available_regs(ctx, use, min_index);
      } else {
         if (instr->index < *min_index)
            *min_index = instr->index;

         uint64_t use_available = 0;

         if (instr->reg0_use_count == 0)
            use_available = ~0ull;
         else if (!instr->reg0_is_attr)
            use_available = 0xfull << (4 * instr->reg0_index);

         if (instr->reg1_use_count == 0)
            use_available = ~0ull;
         else
            use_available |= 0xfull << (4 * instr->reg1_index);

         available &= use_available;
      }
   }

   return available;
}

/* Using "min_index" returned by get_available_regs(), figure out which
 * registers are killed by a write after or during the current instruction
 * and hence cannot be used for spilling. Writes that haven't been scheduled
 * yet should be reflected in live_physregs.
 */

884bf215546Sopenharmony_cistatic uint64_t get_killed_regs(sched_ctx *ctx, int min_index)
885bf215546Sopenharmony_ci{
886bf215546Sopenharmony_ci   uint64_t killed = 0;
887bf215546Sopenharmony_ci
888bf215546Sopenharmony_ci   list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) {
889bf215546Sopenharmony_ci      if (instr->index <= min_index)
890bf215546Sopenharmony_ci         break;
891bf215546Sopenharmony_ci
892bf215546Sopenharmony_ci      for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3;
893bf215546Sopenharmony_ci           slot++) {
894bf215546Sopenharmony_ci         if (!instr->slots[slot])
895bf215546Sopenharmony_ci            continue;
896bf215546Sopenharmony_ci
897bf215546Sopenharmony_ci         gpir_store_node *store = gpir_node_to_store(instr->slots[slot]);
898bf215546Sopenharmony_ci         if (store->node.op != gpir_op_store_reg)
899bf215546Sopenharmony_ci            continue;
900bf215546Sopenharmony_ci
901bf215546Sopenharmony_ci         killed |= 1ull << (4 * store->index + store->component);
902bf215546Sopenharmony_ci      }
903bf215546Sopenharmony_ci   }
904bf215546Sopenharmony_ci
905bf215546Sopenharmony_ci   return killed;
906bf215546Sopenharmony_ci}
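
/* Illustrative sketch (hypothetical helpers, not part of the scheduler): the
 * register file is modelled as a 64-bit mask with one bit per physreg
 * component, i.e. bit 4 * index + component stands for $index.{x,y,z,w}.
 * This is the same layout used by get_killed_regs() above and by
 * live_physregs elsewhere in this file.
 */
#if 0
static inline uint64_t physreg_bit(unsigned reg_index, unsigned component)
{
   /* $reg_index.component -> one bit in the 16 x vec4 = 64-bit mask */
   return 1ull << (4 * reg_index + component);
}

static inline bool physreg_in_mask(uint64_t mask, unsigned reg_index,
                                   unsigned component)
{
   return (mask & physreg_bit(reg_index, component)) != 0;
}
#endif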

/* Actually spill a node so that it is no longer in the ready list. Note that
 * this must exactly follow the logic of get_available_regs() or else the
 * loads could fail to schedule.
 */

static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
{
   gpir_node_foreach_succ_safe(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *use = dep->succ;
      gpir_instr *instr = use->sched.instr;

      if (!instr)
         continue;

      if (use->op == gpir_op_mov && instr == ctx->instr) {
         spill_node(ctx, use, store);
      } else {
         gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg);
         load->index = store->index;
         load->component = store->component;
         list_add(&load->node.list, &ctx->block->node_list);
         gpir_node_replace_child(dep->succ, dep->pred, &load->node);
         gpir_node_replace_pred(dep, &load->node);
         gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
         gpir_debug("spilling use %d of node %d to load node %d\n",
                    use->index, node->index, load->node.index);
         ASSERTED bool result = _try_place_node(ctx, use->sched.instr, &load->node);
         assert(result);
      }
   }

   if (node->op == gpir_op_mov) {
      /* We replaced all the uses of the move, so it's dead now. */
      gpir_instr_remove_node(node->sched.instr, node);
      gpir_node_delete(node);
   } else {
      /* We deleted all the uses of the node except the store, so it's not
       * live anymore.
       */
      list_del(&node->list);
      node->sched.inserted = false;
      ctx->ready_list_slots--;
      if (node->sched.max_node) {
         node->sched.max_node = false;
         ctx->instr->alu_num_slot_needed_by_max--;
      }
      if (node->sched.next_max_node) {
         node->sched.next_max_node = false;
         ctx->instr->alu_num_unscheduled_next_max--;
      }
   }
}
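
/* Descriptive note (added commentary, not upstream documentation): for a
 * single already-scheduled use U of the spilled value N with register store
 * S at $r.c, spill_node() above turns
 *
 *    U reads N directly
 * into
 *    U reads a fresh load of $r.c, placed in U's instruction, with a
 *    read-after-write dependency on S so the load is ordered after the store.
 *
 * Uses that are moves in the current instruction are spilled recursively
 * instead, mirroring the recursion in get_available_regs().
 */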

static bool used_by_store(gpir_node *node, gpir_instr *instr)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      if (dep->succ->type == gpir_node_type_store &&
          dep->succ->sched.instr == instr)
         return true;
   }

   return false;
}

static gpir_node *consuming_postlog2(gpir_node *node)
{
   if (node->op != gpir_op_complex1)
      return NULL;

   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;
      if (dep->succ->op == gpir_op_postlog2)
         return dep->succ;
      else
         return NULL;
   }

   return NULL;
}

static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
{
   assert(node->op != gpir_op_mov);

   if (used_by_store(node, ctx->instr))
      return false;

   gpir_debug("trying to spill %d\n", node->index);

   int min_instr = INT_MAX;
   uint64_t available = get_available_regs(ctx, node, &min_instr);
   available &= ~get_killed_regs(ctx, min_instr);

   if (node->sched.physreg_store) {
      gpir_store_node *store = node->sched.physreg_store;
      if (!(available & (1ull << (4 * store->index + store->component))))
         return false;
   } else {
      available &= ~ctx->live_physregs;

      if (available == 0)
         return false;

      /* Don't spill complex1 directly if it's used by a postlog2: turn the
       * postlog2 into a move, replace the complex1 with a new postlog2, and
       * spill that instead. The store needs a move anyway, so the postlog2 is
       * usually free.
       */
      gpir_node *postlog2 = consuming_postlog2(node);
      if (postlog2) {
         postlog2->op = gpir_op_mov;
         node = create_postlog2(ctx, node);
      }

      /* TODO: use a better heuristic for choosing an available register? */
      int physreg = ffsll(available) - 1;

      ctx->live_physregs |= (1ull << physreg);

      gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
      store->index = physreg / 4;
      store->component = physreg % 4;
      store->child = node;
      store->node.sched.max_node = false;
      store->node.sched.next_max_node = false;
      store->node.sched.complex_allowed = false;
      store->node.sched.pos = -1;
      store->node.sched.instr = NULL;
      store->node.sched.inserted = false;
      store->node.sched.dist = node->sched.dist;
      if (node->op == gpir_op_complex1) {
         /* Complex1 cannot be directly stored, and has a latency of 2 */
         store->node.sched.dist += 2;
      }
      node->sched.physreg_store = store;
      gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);

      list_for_each_entry(gpir_load_node, load,
                          &ctx->physreg_reads[physreg], reg_link) {
         gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
         if (load->node.sched.ready) {
            list_del(&load->node.list);
            load->node.sched.ready = false;
         }
      }

      node->sched.ready = false;
      schedule_insert_ready_list(ctx, &store->node);
   }

   gpir_debug("spilling %d to $%d.%c, store %d\n", node->index,
              node->sched.physreg_store->index,
              "xyzw"[node->sched.physreg_store->component],
              node->sched.physreg_store->node.index);

   spill_node(ctx, node, node->sched.physreg_store);

   return true;
}
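
/* A minimal sketch (hypothetical helper) of how try_spill_node() above picks
 * a register out of the availability mask: ffsll() gives the 1-based position
 * of the lowest set bit, so subtracting one yields the component bit, and
 * dividing/modding by four recovers the vec4 register index and component.
 * ffsll() is the same call the function above already relies on.
 */
#if 0
static bool pick_lowest_free_physreg(uint64_t available, unsigned *reg_index,
                                     unsigned *component)
{
   if (available == 0)
      return false;

   int bit = ffsll(available) - 1;   /* lowest available component */
   *reg_index = bit / 4;             /* which vec4 register */
   *component = bit % 4;             /* x, y, z or w */
   return true;
}
#endif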

static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node)
{
   /* First, try to spill max nodes. */
   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
      if (ctx->max_node_spill_needed <= 0)
         break;

      /* orig_node is the node we're trying to schedule, so spilling it makes
       * no sense. Also don't try to spill any nodes in front of it, since
       * they might be scheduled instead.
       */
      if (node == orig_node)
         break;

      if (node->op == gpir_op_mov) {
         /* Don't try to spill loads, since that only adds another load and
          * store which is likely pointless.
          */
         continue;
      }

      if (!gpir_is_input_node(node) || !node->sched.max_node)
         continue;

      if (try_spill_node(ctx, node)) {
         ctx->max_node_spill_needed--;
         ctx->total_spill_needed--;
      }
   }

   /* Now, try to spill the remaining nodes. */
   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
      if (ctx->total_spill_needed <= 0)
         break;

      if (node == orig_node)
         break;

      if (node->op == gpir_op_mov)
         continue;

      if (!gpir_is_input_node(node) ||
          !(node->sched.max_node || node->sched.next_max_node))
         continue;

      if (try_spill_node(ctx, node))
         ctx->total_spill_needed--;
   }

   return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
}
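
/* Descriptive note (added commentary): try_spill_nodes() makes two passes
 * over the ready list, back to front. The first pass spills only max nodes
 * until max_node_spill_needed is satisfied; the second spills max or next-max
 * nodes until total_spill_needed is satisfied. Both budgets are zeroed by
 * try_node() before each placement attempt, and the return value above simply
 * reports whether both reached zero.
 */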

static int ASSERTED gpir_get_curr_ready_list_slots(sched_ctx *ctx)
{
   int total = 0;
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      total += gpir_get_slots_required(node);
   }

   return total;
}

/* What gpir_get_min_end() would return if node were replaced with a move
 * instruction not in the complex slot. Normally this is 2 + min_end, except
 * for some store instructions which must have the move node in the same
 * instruction.
 */
static int gpir_get_min_end_as_move(gpir_node *node)
{
   int min = INT_MAX;
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
         switch (succ->op) {
         case gpir_op_store_temp:
         case gpir_op_store_reg:
         case gpir_op_store_varying:
            continue;
         default:
            break;
         }
         if (min > succ->sched.instr->index + 2)
            min = succ->sched.instr->index + 2;
      }
   }
   return min;
}
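
/* Descriptive example (added commentary): if the nearest scheduled input
 * successor of a node sits in instruction index 7, the loop above returns
 * 7 + 2 = 9 -- the deadline instruction by which a (non-complex) move for
 * this value must be placed. Store successors are skipped because they need
 * the moved value in their own instruction, as noted in the comment above.
 */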

/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
 * The hardware overwrites the add second sources with 0 and mul second
 * sources with 1. This can be a problem if we need to insert more next-max
 * moves but we only have values that can't use the complex unit for moves.
 *
 * Fortunately, we only need to insert a next-max move if there are more than
 * 5 next-max nodes, but there are only 4 sources in the previous instruction
 * that make values not complex-capable, which means there can be at most 4
 * non-complex-capable values. Hence there will always be at least two values
 * that can be rewritten to use a move in the complex slot. However, we have
 * to be careful not to waste those values by putting both of them in a
 * non-complex slot. This is handled for us by gpir_instr, which will reject
 * such instructions. We just need to tell it which nodes can use complex, and
 * it will do the accounting to figure out what is safe.
 */

static bool can_use_complex(gpir_node *node)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *succ = dep->succ;
      if (succ->type != gpir_node_type_alu ||
          !succ->sched.instr)
         continue;

      /* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1}
       */
      gpir_alu_node *alu = gpir_node_to_alu(succ);
      switch (alu->node.op) {
      case gpir_op_complex1:
         /* complex1 puts its third source in the fourth slot */
         if (alu->children[1] == node || alu->children[2] == node)
            return false;
         break;
      case gpir_op_complex2:
         /* complex2 has its source duplicated, since it actually takes two
          * sources but we only ever use it with both sources the same. Hence
          * its source can never be the complex slot.
          */
         return false;
      case gpir_op_select:
         /* Select has its sources rearranged */
         if (alu->children[0] == node)
            return false;
         break;
      default:
         assert(alu->num_child <= 2);
         if (alu->num_child == 2 && alu->children[1] == node)
            return false;
         break;
      }
   }

   return true;
}
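
/* Worked instance (added commentary) of the counting argument in the comment
 * above can_use_complex(): a next-max move only has to be forced once there
 * are more than 5 next-max nodes, and at most 4 of them can be
 * non-complex-capable (one per second source of mul0/mul1/add0/add1 in the
 * previous instruction), so at least 6 - 4 = 2 of them can always be given a
 * move in the complex slot.
 */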

/* Initialize node->sched.max_node and node->sched.next_max_node for every
 * input node on the ready list. We should only need to do this once per
 * instruction, at the beginning, since we never add max nodes to the ready
 * list.
 */

static void sched_find_max_nodes(sched_ctx *ctx)
{
   ctx->instr->alu_num_unscheduled_next_max = 0;
   ctx->instr->alu_num_slot_needed_by_max = 0;

   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node))
         continue;

      int min_end_move = gpir_get_min_end_as_move(node);
      node->sched.max_node = (min_end_move == ctx->instr->index);
      node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
      if (node->sched.next_max_node)
         node->sched.complex_allowed = can_use_complex(node);

      if (node->sched.max_node)
         ctx->instr->alu_num_slot_needed_by_max++;
      if (node->sched.next_max_node)
         ctx->instr->alu_num_unscheduled_next_max++;
   }
}

/* Verify the invariants described in gpir.h, as well as making sure the
 * counts are correct.
 */
static void ASSERTED verify_max_nodes(sched_ctx *ctx)
{
   int alu_num_slot_needed_by_max = 0;
   int alu_num_unscheduled_next_max = 0;
   int alu_num_slot_needed_by_store = 0;
   int alu_num_slot_needed_by_non_cplx_store = 0;
   ASSERTED int alu_max_allowed_next_max = 5;

   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node))
         continue;

      if (node->sched.max_node)
         alu_num_slot_needed_by_max++;
      if (node->sched.next_max_node)
         alu_num_unscheduled_next_max++;
      if (used_by_store(node, ctx->instr)) {
         alu_num_slot_needed_by_store++;
         if (node->sched.next_max_node && !node->sched.complex_allowed)
            alu_num_slot_needed_by_non_cplx_store++;
      }
   }

   if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] &&
       ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1)
      alu_max_allowed_next_max = 4;

   assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
   assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max);
   assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max);
   assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
   assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
          alu_num_slot_needed_by_non_cplx_store);
   assert(ctx->instr->alu_num_slot_free >=
          alu_num_slot_needed_by_store + alu_num_slot_needed_by_max +
          MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0));
   assert(ctx->instr->alu_non_cplx_slot_free >=
          alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
}

static bool try_node(sched_ctx *ctx)
{
   gpir_node *best_node = NULL;
   int best_score = INT_MIN;

   /* Spilling will delete arbitrary nodes after the current one in the ready
    * list, which means that we always need to look up the next node in the
    * list at the end of each iteration. While list_for_each_entry() works for
    * this purpose, its sanity checking assumes that you don't want to modify
    * the list at all. We know better here, so we have to open-code
    * list_for_each_entry() without the check in order to not assert.
    */
   for (gpir_node *node = list_entry(ctx->ready_list.next, gpir_node, list);
        &node->list != &ctx->ready_list;
        node = list_entry(node->list.next, gpir_node, list)) {
      if (best_score != INT_MIN) {
         if (node->sched.dist < best_node->sched.dist)
            break;
      }

      if (node->sched.ready) {
         ctx->total_spill_needed = 0;
         ctx->max_node_spill_needed = 0;
         int score = schedule_try_node(ctx, node, true);
         if (score == INT_MIN && !best_node &&
             ctx->total_spill_needed > 0 &&
             try_spill_nodes(ctx, node)) {
            score = schedule_try_node(ctx, node, true);
         }

         /* schedule_first nodes must be scheduled if possible */
         if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) {
            best_node = node;
            best_score = score;
            break;
         }

         if (score > best_score) {
            best_score = score;
            best_node = node;
         }
      }
   }

   if (best_node) {
      gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
                 best_score, best_node->sched.max_node ? " (max)" : "");
      ASSERTED int score = schedule_try_node(ctx, best_node, false);
      assert(score != INT_MIN);
      return true;
   }

   return false;
}
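
/* A minimal sketch (hypothetical, unused) of the open-coded walk used in
 * try_node() above: re-reading list.next at the top of each step keeps the
 * walk valid even when the loop body unlinks entries that come *after* the
 * current node, which is exactly what spilling may do to the ready list.
 */
#if 0
static void walk_tolerating_later_deletions(sched_ctx *ctx)
{
   for (gpir_node *node = list_entry(ctx->ready_list.next, gpir_node, list);
        &node->list != &ctx->ready_list;
        node = list_entry(node->list.next, gpir_node, list)) {
      /* The body may delete nodes after "node"; deleting "node" itself would
       * still break the walk, just as with list_for_each_entry().
       */
   }
}
#endif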

static void place_move(sched_ctx *ctx, gpir_node *node)
{
   /* For complex1 that is consumed by a postlog2, we cannot allow any moves
    * in between. Convert the postlog2 to a move and insert a new postlog2,
    * and try to schedule it again in try_node().
    */
   gpir_node *postlog2 = consuming_postlog2(node);
   if (postlog2) {
      postlog2->op = gpir_op_mov;
      create_postlog2(ctx, node);
      return;
   }

   gpir_node *move = create_move(ctx, node);
   gpir_node_foreach_succ_safe(move, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr ||
          ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
         gpir_node_replace_pred(dep, node);
         if (dep->type == GPIR_DEP_INPUT)
            gpir_node_replace_child(succ, move, node);
      }
   }
   ASSERTED int score = schedule_try_node(ctx, move, false);
   assert(score != INT_MIN);
}

/* For next-max nodes, not every node can be offloaded to a move in the
 * complex slot. If we run out of non-complex slots, then such nodes cannot
 * have moves placed for them. There should always be sufficient
 * complex-capable nodes so that this isn't a problem. We also disallow moves
 * for schedule_first nodes here.
 */
static bool can_place_move(sched_ctx *ctx, gpir_node *node)
{
   if (gpir_op_infos[node->op].schedule_first)
      return false;

   if (!node->sched.next_max_node)
      return true;

   if (node->sched.complex_allowed)
      return true;

   return ctx->instr->alu_non_cplx_slot_free > 0;
}

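/* Summary of the move-placement priorities implemented below (added
 * commentary): place moves first for max nodes, then for values consumed by a
 * store in the current instruction, then for a ready complex1 next-max node
 * whose uses are not all in the previous instruction, then -- walking the
 * ready list from the back -- opportunistically for other next-max nodes
 * while ALU slots remain, and finally for any next-max node once
 * alu_num_unscheduled_next_max exceeds alu_max_allowed_next_max.
 */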
static bool sched_move(sched_ctx *ctx)
{
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (node->sched.max_node) {
         place_move(ctx, node);
         return true;
      }
   }

   if (ctx->instr->alu_num_slot_needed_by_store > 0) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (used_by_store(node, ctx->instr)) {
            place_move(ctx, node);
            /* If we have a store of a load, then we need to make sure that we
             * immediately schedule the dependent load, or create a move
             * instruction for it, like we would with a normal instruction.
             * The rest of the code isn't set up to handle load nodes in the
             * ready list -- see the comments in _schedule_try_node().
             */
            if (node->type == gpir_node_type_load) {
               if (!schedule_try_place_node(ctx, node, false)) {
                  create_move(ctx, node);
               }
            }
            return true;
         }
      }
   }

   /* complex1 is a bit of a special case, since it has a latency of 2 cycles.
    * Once it is fully ready, we need to group all its uses in the same
    * instruction, and then we need to avoid creating any moves in the next
    * cycle in order to get it scheduled. Failing to do any of these things
    * could result in a cycle penalty, or even worse, an infinite loop of
    * inserting moves. If it is a next-max node and ready, then it has a use
    * in the previous cycle. If it has a use in the current cycle as well,
    * then we want to insert a move node to make it ready in two cycles -- if
    * we don't, then there will be at least a one cycle penalty. Otherwise, it
    * will be ready next cycle, and we shouldn't insert a move node, or else
    * we'll also have a one cycle penalty.
    */
   if (ctx->instr->alu_num_slot_free > 0) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
             node->sched.ready) {
            bool skip = true;
            gpir_node_foreach_succ(node, dep) {
               if (dep->type != GPIR_DEP_INPUT)
                  continue;

               gpir_node *succ = dep->succ;

               if (!succ->sched.instr ||
                   succ->sched.instr->index != ctx->instr->index - 1) {
                  skip = false;
                  break;
               }
            }

            if (skip)
               continue;

            place_move(ctx, node);
            return true;
         }
      }
   }

   /* Once we've made all the required moves, we're free to use any extra
    * slots to schedule more moves for next-max nodes. Besides sometimes being
    * necessary, this can free up extra space in the next instruction. We walk
    * from back to front so that we first pick the nodes least likely to be
    * scheduled next -- for a node that is about to be scheduled anyway, an
    * extra move would be unnecessary. Make sure not to re-handle the complex1
    * case that was handled above.
    */
   if (ctx->instr->alu_num_slot_free > 0) {
      list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node &&
             !(node->op == gpir_op_complex1 && node->sched.ready)) {
            place_move(ctx, node);
            return true;
         }
      }
   }

   /* We may have skipped complex1 above, but if we run out of space, we still
    * need to insert the move.
    */

   if (ctx->instr->alu_num_unscheduled_next_max >
       ctx->instr->alu_max_allowed_next_max) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node) {
            place_move(ctx, node);
            return true;
         }
      }
   }

   return false;
}

static bool gpir_sched_instr_pass(sched_ctx *ctx)
{
   if (try_node(ctx))
      return true;

   if (sched_move(ctx))
      return true;

   return false;
}

static void schedule_print_pre_one_instr(sched_ctx *ctx)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("instr %d for ready list:", ctx->instr->index);
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p',
             node->sched.dist, gpir_get_slots_required(node),
             node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none"));
   }
   printf("\nlive physregs: ");
   for (unsigned i = 0; i < 16; i++) {
      if (ctx->live_physregs & (0xfull << (4 * i))) {
         printf("$%d.", i);
         for (unsigned j = 0; j < 4; j++) {
            if (ctx->live_physregs & (1ull << (4 * i + j)))
               printf("%c", "xyzw"[j]);
         }
         printf(" ");
      }
   }
   printf("\n");
}

static void schedule_print_post_one_instr(gpir_instr *instr)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("post schedule instr");
   for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) {
      if (instr->slots[i])
         printf(" %d/%d", i, instr->slots[i]->index);
   }
   printf("\n");
}


static bool schedule_one_instr(sched_ctx *ctx)
{
   gpir_instr *instr = gpir_instr_create(ctx->block);
   if (unlikely(!instr))
      return false;

   ctx->instr = instr;

   sched_find_max_nodes(ctx);
   schedule_print_pre_one_instr(ctx);

   while (gpir_sched_instr_pass(ctx)) {
      assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
#ifndef NDEBUG
      verify_max_nodes(ctx);
      verify_ready_list(ctx);
#endif
   }

   schedule_print_post_one_instr(instr);
   return true;
}

static bool schedule_block(gpir_block *block)
{
   /* calculate distance */
   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      if (gpir_node_is_root(node))
         schedule_update_distance(node);
   }

   sched_ctx ctx;
   list_inithead(&ctx.ready_list);
   ctx.block = block;
   ctx.ready_list_slots = 0;
   ctx.live_physregs = block->live_out_phys;

   for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
      list_inithead(&ctx.physreg_reads[i]);
   }

   /* construct the ready list from root nodes */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      /* Add to physreg_reads */
      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         unsigned index = 4 * load->index + load->component;
         list_addtail(&load->reg_link, &ctx.physreg_reads[index]);
      }

      if (gpir_node_is_root(node))
         schedule_insert_ready_list(&ctx, node);
   }

   list_inithead(&block->node_list);
   while (!list_is_empty(&ctx.ready_list)) {
      if (!schedule_one_instr(&ctx))
         return false;
   }

   return true;
}

static void schedule_build_dependency(gpir_block *block)
{
   /* merge dummy_f/m back into the node they were created from */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_dummy_m) {
         gpir_alu_node *alu = gpir_node_to_alu(node);
         gpir_node *origin = alu->children[0];
         gpir_node *dummy_f = alu->children[1];

         gpir_node_foreach_succ(node, dep) {
            gpir_node *succ = dep->succ;
            /* origin and node may have the same succ (via a VREG/INPUT or
             * VREG/VREG dep), so use gpir_node_add_dep() instead of
             * gpir_node_replace_pred() */
            gpir_node_add_dep(succ, origin, dep->type);
            gpir_node_replace_child(succ, node, origin);
         }
         gpir_node_delete(dummy_f);
         gpir_node_delete(node);
      }
   }
}

static void print_statistic(gpir_compiler *comp, int save_index)
{
   int num_nodes[gpir_op_num] = {0};
   int num_created_nodes[gpir_op_num] = {0};

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         num_nodes[node->op]++;
         if (node->index >= save_index)
            num_created_nodes[node->op]++;
      }
   }

   printf("====== gpir scheduler statistic ======\n");
   printf("---- how many nodes are scheduled ----\n");
   int n = 0, l = 0;
   for (int i = 0; i < gpir_op_num; i++) {
      if (num_nodes[i]) {
         printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]);
         n += num_nodes[i];
         if (!(++l % 4))
            printf("\n");
      }
   }
   if (l % 4)
      printf("\n");
   printf("\ntotal: %d\n", n);

   printf("---- how many nodes are created ----\n");
   n = l = 0;
   for (int i = 0; i < gpir_op_num; i++) {
      if (num_created_nodes[i]) {
         printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]);
         n += num_created_nodes[i];
         if (!(++l % 4))
            printf("\n");
      }
   }
   if (l % 4)
      printf("\n");
   printf("\ntotal: %d\n", n);
   printf("------------------------------------\n");
}

bool gpir_schedule_prog(gpir_compiler *comp)
{
   int save_index = comp->cur_index;

   /* init schedule info */
   int index = 0;
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      block->sched.instr_index = 0;
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         node->sched.instr = NULL;
         node->sched.pos = -1;
         node->sched.index = index++;
         node->sched.dist = -1;
         /* TODO when we support multiple basic blocks, we need a way to keep
          * track of this for physregs allocated before the scheduler.
          */
         node->sched.physreg_store = NULL;
         node->sched.ready = false;
         node->sched.inserted = false;
         node->sched.complex_allowed = false;
         node->sched.max_node = false;
         node->sched.next_max_node = false;
      }
   }

   /* build dependency */
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      schedule_build_dependency(block);
   }

   //gpir_debug("after scheduler build reg dependency\n");
   //gpir_node_print_prog_dep(comp);

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      if (!schedule_block(block)) {
         gpir_error("fail schedule block\n");
         return false;
      }
   }

   if (lima_debug & LIMA_DEBUG_GP) {
      print_statistic(comp, save_index);
      gpir_instr_print_prog(comp);
   }

   return true;
}