ir/gp/scheduler.c

bf215546Sopenharmony_ci/*
bf215546Sopenharmony_ci * Copyright (c) 2017 Lima Project
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sub license,
bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the
bf215546Sopenharmony_ci * next paragraph) shall be included in all copies or substantial portions
bf215546Sopenharmony_ci * of the Software.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
bf215546Sopenharmony_ci * DEALINGS IN THE SOFTWARE.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci#include <limits.h>
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci#include "gpir.h"
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/*
bf215546Sopenharmony_ci * GP scheduling algorithm (by Connor Abbott <cwabbott0@gmail.com>)
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * The GP pipeline has three main stages:
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * --------------------------------------------------------
bf215546Sopenharmony_ci * |                                                      |
bf215546Sopenharmony_ci * |               Register/Attr/Temp Fetch               |
bf215546Sopenharmony_ci * |                                                      |
bf215546Sopenharmony_ci * --------------------------------------------------------
bf215546Sopenharmony_ci * |        |        |        |        |        |         |
bf215546Sopenharmony_ci * |  Mul0  |  Mul1  |  Add0  |  Add1  |  Cplx  |  Pass   |
bf215546Sopenharmony_ci * |        |        |        |        |        |         |
bf215546Sopenharmony_ci * --------------------------------------------------------
bf215546Sopenharmony_ci * |                 |                          |         |
bf215546Sopenharmony_ci * |    Complex1     |   Temp/Register/Varying  |  Pass   |
bf215546Sopenharmony_ci * |    Stage 2      |         Store            | Stage 2 |
bf215546Sopenharmony_ci * |                 |                          |         |
bf215546Sopenharmony_ci * --------------------------------------------------------
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Because of this setup, storing a register has a latency of three cycles.
bf215546Sopenharmony_ci * Also, the register file is organized into 4-component vectors, and the
bf215546Sopenharmony_ci * load stage can only load two vectors at a time. Aside from these highly
bf215546Sopenharmony_ci * constrained register load/store units, there is an explicit bypass
 * network, where each unit (mul0/mul1/etc.) can access the results of any
 * unit from the previous two cycles directly, except for the complex
bf215546Sopenharmony_ci * unit whose result can only be accessed for one cycle (since it's expected
bf215546Sopenharmony_ci * to be used directly by the complex2 instruction in the following cycle).
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Because of the very restricted register file, and because only rarely are
bf215546Sopenharmony_ci * all the units in use at the same time, it can be very beneficial to use
bf215546Sopenharmony_ci * the unused units to "thread" a value from source to destination by using
bf215546Sopenharmony_ci * moves in the otherwise-unused units, without involving the register file
bf215546Sopenharmony_ci * at all. It's very difficult to fully exploit this with a traditional
bf215546Sopenharmony_ci * scheduler, so we need to do something a little un-traditional. The 512
bf215546Sopenharmony_ci * instruction limit means that for more complex shaders, we need to do as
bf215546Sopenharmony_ci * well as possible or else the app won't even work.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * The scheduler works by considering the bypass network as a kind of
bf215546Sopenharmony_ci * register file. It's a quite unusual register file, since registers have to
bf215546Sopenharmony_ci * be assigned "on the fly" as we schedule operations, but with some care, we
bf215546Sopenharmony_ci * can use something conceptually similar to a linear-scan allocator to
bf215546Sopenharmony_ci * successfully schedule nodes to instructions without running into
bf215546Sopenharmony_ci * conflicts.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Values in the IR are separated into normal values, or "value registers",
bf215546Sopenharmony_ci * which is what normal nodes like add, mul, etc. produce, and which only
bf215546Sopenharmony_ci * live inside one basic block, and registers, which can span multiple basic
bf215546Sopenharmony_ci * blocks but have to be accessed via special load_reg/store_reg nodes. RA
bf215546Sopenharmony_ci * assigns physical registers to both value registers and normal registers,
bf215546Sopenharmony_ci * treating load_reg/store_reg as a move instruction, but these are only used
bf215546Sopenharmony_ci * directly for normal registers -- the physreg assigned to a value register
bf215546Sopenharmony_ci * is "fake," and is only used inside the scheduler. Before scheduling we
 * insert write-after-read dependencies, even for value registers, as if
bf215546Sopenharmony_ci * we're going to use those, but then we throw them away. For example, if we
bf215546Sopenharmony_ci * had something like:
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * (*)r2 = add (*)r1, (*)r2
bf215546Sopenharmony_ci * (*)r1 = load_reg r0
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * we'd insert a write-after-read dependency between the add and load_reg,
bf215546Sopenharmony_ci * even though the starred registers aren't actually used by the scheduler
bf215546Sopenharmony_ci * after this step. This step is crucial since it guarantees that during any
bf215546Sopenharmony_ci * point in the schedule, the number of live registers + live value registers
bf215546Sopenharmony_ci * will never exceed the capacity of the register file and the bypass network
bf215546Sopenharmony_ci * combined. This is because each live register/value register will have a
bf215546Sopenharmony_ci * different fake number, thanks to the fake dependencies inserted before
bf215546Sopenharmony_ci * scheduling. This allows us to not have to worry about spilling to
bf215546Sopenharmony_ci * temporaries, which is only done ahead of time.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * The scheduler is a bottom-up scheduler. It keeps track of each live value
bf215546Sopenharmony_ci * register, and decides on-the-fly which value registers to keep in the
bf215546Sopenharmony_ci * bypass network and which to "spill" to registers. Of particular importance
bf215546Sopenharmony_ci * is the "ready list," which consists of "input nodes" (nodes that produce a
bf215546Sopenharmony_ci * value that can be consumed via the bypass network), both "partially ready"
bf215546Sopenharmony_ci * (only some of the uses have been scheduled) and "fully ready" (all uses
bf215546Sopenharmony_ci * have been scheduled), as well as other non-input nodes like register
bf215546Sopenharmony_ci * stores. Each input node on the ready list represents a live value register
bf215546Sopenharmony_ci * before the current instruction. There must be at most 11 such input nodes
bf215546Sopenharmony_ci * at all times, since there are only 11 slots in the next two instructions
bf215546Sopenharmony_ci * which can reach the current instruction.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * An input node is a "max node" if it has a use two cycles ago, which must be
bf215546Sopenharmony_ci * connected to a definition this cycle. Otherwise it may be a "next max node"
bf215546Sopenharmony_ci * if it will be a max node on the next instruction (i.e. it has a use at most
bf215546Sopenharmony_ci * one cycle ago), or it may be neither if all of its uses are this cycle. As
bf215546Sopenharmony_ci * we keep adding instructions to the front, input nodes graduate from
bf215546Sopenharmony_ci * neither, to next max, to max, unless we decide to insert a move to keep it
bf215546Sopenharmony_ci * alive longer, at which point any uses after the current instruction are
bf215546Sopenharmony_ci * rewritten to be uses of the move so that the original node returns to
bf215546Sopenharmony_ci * neither. The scheduler decides which nodes to try freely, but we have to
bf215546Sopenharmony_ci * reserve slots for two different reasons: (1) out of the 5 non-complex
bf215546Sopenharmony_ci * slots, we reserve a slot for each max node, so that we can connect a
bf215546Sopenharmony_ci * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a
bf215546Sopenharmony_ci * slot for every next-max node above 5, so that for the next instruction
bf215546Sopenharmony_ci * there are no more than 5 max nodes. When a max or next-max node gets
bf215546Sopenharmony_ci * scheduled, the corresponding reservation is reduced by one. At the end, we
bf215546Sopenharmony_ci * insert moves for every slot that was reserved. The reservation is actually
 * managed by gpir_instr, and all we have to do is tell it how many to reserve
bf215546Sopenharmony_ci * at the beginning and then tell it which nodes are max/next-max nodes. When
bf215546Sopenharmony_ci * we start scheduling an instruction, there will be at most 5 max nodes
bf215546Sopenharmony_ci * thanks to the previous instruction's next-max reservation/move insertion.
bf215546Sopenharmony_ci * Since there are at most 11 total input nodes, if there are N max nodes,
bf215546Sopenharmony_ci * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 =
bf215546Sopenharmony_ci * 6 - N slots need to be reserved for next-max nodes, and so at most
bf215546Sopenharmony_ci * 6 - N + N = 6 slots need to be reserved in total, exactly the total number
bf215546Sopenharmony_ci * of slots. So, thanks to the total input node restriction, we will never
bf215546Sopenharmony_ci * need to reserve too many slots.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * It sometimes happens that scheduling a given node will violate this total
bf215546Sopenharmony_ci * input node restriction, or that a reservation will mean that we can't
bf215546Sopenharmony_ci * schedule it. We first schedule a node "speculatively" to see if this is a
bf215546Sopenharmony_ci * problem. If some of the node's sources are loads, then we can schedule
bf215546Sopenharmony_ci * the node and its dependent loads in one swoop to avoid going over the
bf215546Sopenharmony_ci * pressure limit. If that fails, we can try to spill a ready or
bf215546Sopenharmony_ci * partially-ready input node to a register by rewriting all of its uses to
bf215546Sopenharmony_ci * refer to a register load. This removes it from the list of ready and
bf215546Sopenharmony_ci * partially ready input nodes as all of its uses are now unscheduled. If
bf215546Sopenharmony_ci * successful, we can then proceed with scheduling the original node. All of
bf215546Sopenharmony_ci * this happens "speculatively," meaning that afterwards the node is removed
bf215546Sopenharmony_ci * and the entire state of the scheduler is reverted to before it was tried, to
bf215546Sopenharmony_ci * ensure that we never get into an invalid state and run out of spots for
bf215546Sopenharmony_ci * moves. In try_nodes(), we try to schedule each node speculatively on the
bf215546Sopenharmony_ci * ready list, keeping only the nodes that could be successfully scheduled, so
bf215546Sopenharmony_ci * that when we finally decide which node to actually schedule, we know it
bf215546Sopenharmony_ci * will succeed.  This is how we decide on the fly which values go in
bf215546Sopenharmony_ci * registers and which go in the bypass network. Note that "unspilling" a
bf215546Sopenharmony_ci * value is simply a matter of scheduling the store_reg instruction created
bf215546Sopenharmony_ci * when we spill.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * The careful accounting of live value registers, reservations for moves, and
bf215546Sopenharmony_ci * speculative scheduling guarantee that we never run into a failure case
bf215546Sopenharmony_ci * while scheduling. However, we need to make sure that this scheduler will
bf215546Sopenharmony_ci * not get stuck in an infinite loop, i.e. that we'll always make forward
bf215546Sopenharmony_ci * progress by eventually scheduling a non-move node. If we run out of value
bf215546Sopenharmony_ci * registers, then we may have to spill a node to a register. If we
bf215546Sopenharmony_ci * were to schedule one of the fully-ready nodes, then we'd have 11 + N live
bf215546Sopenharmony_ci * value registers before the current instruction. But since there are at most
 * 64+11 live registers and value registers total thanks to the fake
bf215546Sopenharmony_ci * dependencies we inserted before scheduling, there are at most 64 - N live
bf215546Sopenharmony_ci * physical registers, and therefore there are at least N registers available
bf215546Sopenharmony_ci * for spilling. Not all these registers will be available immediately, since
bf215546Sopenharmony_ci * in order to spill a node to a given register we have to ensure that there
bf215546Sopenharmony_ci * are slots available to rewrite every use to a load instruction, and that
bf215546Sopenharmony_ci * may not be the case. There may also be intervening writes which prevent
bf215546Sopenharmony_ci * some registers from being used. However, these are all temporary problems,
bf215546Sopenharmony_ci * since as we create each instruction, we create additional register load
bf215546Sopenharmony_ci * slots that can be freely used for spilling, and we create more move nodes
bf215546Sopenharmony_ci * which means that the uses of the nodes we're trying to spill keep moving
bf215546Sopenharmony_ci * forward. This means that eventually, these problems will go away, at which
bf215546Sopenharmony_ci * point we'll be able to spill a node successfully, so eventually we'll be
bf215546Sopenharmony_ci * able to schedule the first node on the ready list.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_citypedef struct {
bf215546Sopenharmony_ci   /* This is the list of ready and partially-ready nodes. A partially-ready
bf215546Sopenharmony_ci    * node must have at least one input dependency already scheduled.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   struct list_head ready_list;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* The number of ready or partially-ready nodes with at least one input
bf215546Sopenharmony_ci    * dependency already scheduled. In other words, the number of live value
bf215546Sopenharmony_ci    * registers. This must be at most 11.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   int ready_list_slots;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* The physical registers live into the current instruction. */
bf215546Sopenharmony_ci   uint64_t live_physregs;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* The current instruction. */
bf215546Sopenharmony_ci   gpir_instr *instr;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* The current basic block. */
bf215546Sopenharmony_ci   gpir_block *block;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* True if at least one node failed to schedule due to lack of available
bf215546Sopenharmony_ci    * value registers.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   bool try_spill_all;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* The number of max nodes needed to spill to successfully schedule the
bf215546Sopenharmony_ci    * instruction.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   int max_node_spill_needed;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* The number of max and next-max nodes needed to spill to successfully
bf215546Sopenharmony_ci    * schedule the instruction.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   int total_spill_needed;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* For each physical register, a linked list of loads associated with it in
bf215546Sopenharmony_ci    * this block. When we spill a value to a given register, and there are
bf215546Sopenharmony_ci    * existing loads associated with it that haven't been scheduled yet, we
bf215546Sopenharmony_ci    * have to make sure that the corresponding unspill happens after the last
bf215546Sopenharmony_ci    * original use has happened, i.e. is scheduled before.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM];
bf215546Sopenharmony_ci} sched_ctx;
bf215546Sopenharmony_ci
/* Minimum instruction distance between a producer and an ALU consumer,
 * chosen purely from the producer's opcode (dep->pred->op).
 *
 * Returns 0 for the load opcodes, 2 for complex1 and 1 for everything else.
 */
static int gpir_min_dist_alu(gpir_dep *dep)
{
   gpir_node *producer = dep->pred;

   /* Load results have no minimum distance to their consumer. */
   if (producer->op == gpir_op_load_uniform ||
       producer->op == gpir_op_load_temp ||
       producer->op == gpir_op_load_reg ||
       producer->op == gpir_op_load_attribute)
      return 0;

   /* complex1 takes two cycles. */
   if (producer->op == gpir_op_complex1)
      return 2;

   return 1;
}
bf215546Sopenharmony_ci
/* Minimum instruction distance required between dep->pred and dep->succ,
 * taking the dependency type into account.
 *
 * Returns INT_MAX >> 2 when a store is fed directly by a load or complex1
 * node: that distance can never be met, so the pair cannot be placed
 * without a move in between.
 */
static int gpir_get_min_dist(gpir_dep *dep)
{
   gpir_node *pred = dep->pred;
   gpir_node *succ = dep->succ;

   if (dep->type == GPIR_DEP_INPUT) {
      bool succ_is_store = succ->op == gpir_op_store_temp ||
                           succ->op == gpir_op_store_reg ||
                           succ->op == gpir_op_store_varying;

      if (!succ_is_store)
         return gpir_min_dist_alu(dep);

      /* A store has to take its input from an alu node. complex1 needs two
       * cycles, so its result cannot reach a store through the normal path
       * either. Both cases require a move to be inserted.
       */
      bool needs_move = pred->type == gpir_node_type_load ||
                        pred->op == gpir_op_complex1;

      return needs_move ? INT_MAX >> 2 : 0;
   }

   if (dep->type == GPIR_DEP_OFFSET) {
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_min_dist_alu(dep);
   }

   if (dep->type == GPIR_DEP_READ_AFTER_WRITE) {
      if (succ->op == gpir_op_load_temp && pred->op == gpir_op_store_temp)
         return 4;

      if (succ->op == gpir_op_load_reg && pred->op == gpir_op_store_reg)
         return 3;

      bool pred_sets_load_offset =
         pred->op == gpir_op_store_temp_load_off0 ||
         pred->op == gpir_op_store_temp_load_off1 ||
         pred->op == gpir_op_store_temp_load_off2;

      if (pred_sets_load_offset && succ->op == gpir_op_load_uniform)
         return 4;

      /* Anything else is a fake dependency. */
      return 0;
   }

   /* Write-after-read and any remaining dependency type. */
   return 0;
}
bf215546Sopenharmony_ci
/* Maximum instruction distance at which an ALU consumer can still use the
 * result of dep->pred. Depends on the producer's opcode and, for load_reg
 * and mov, on the slot the producer was placed in.
 */
static int gpir_max_dist_alu(gpir_dep *dep)
{
   gpir_node *producer = dep->pred;

   switch (producer->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
      return 0;

   case gpir_op_load_reg:
      /* Only a load sitting in one of the REG0_LOAD0..REG0_LOAD3 slots gets
       * a distance of one; any other slot gives zero.
       */
      return (producer->sched.pos >= GPIR_INSTR_SLOT_REG0_LOAD0 &&
              producer->sched.pos <= GPIR_INSTR_SLOT_REG0_LOAD3) ? 1 : 0;

   case gpir_op_mov:
      /* A move placed in the complex slot gets one cycle, not two. */
      return producer->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 1 : 2;

   case gpir_op_load_attribute:
   case gpir_op_exp2_impl:
   case gpir_op_log2_impl:
   case gpir_op_rcp_impl:
   case gpir_op_rsqrt_impl:
   case gpir_op_store_temp_load_off0:
   case gpir_op_store_temp_load_off1:
   case gpir_op_store_temp_load_off2:
      return 1;

   default:
      return 2;
   }
}
bf215546Sopenharmony_ci
/* Maximum instruction distance allowed between dep->pred and dep->succ.
 *
 * Only input and offset dependencies impose an upper bound. Every other
 * dependency type yields INT_MAX >> 2, which acts as "unbounded" while
 * still leaving room to add an instruction index without overflowing.
 */
static int gpir_get_max_dist(gpir_dep *dep)
{
   if (dep->type == GPIR_DEP_OFFSET) {
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_max_dist_alu(dep);
   }

   if (dep->type != GPIR_DEP_INPUT)
      return INT_MAX >> 2;

   /* Input dependency: a store gets a maximum distance of zero from its
    * input; everything else follows the ALU rules.
    */
   switch (dep->succ->op) {
   case gpir_op_store_temp:
   case gpir_op_store_reg:
   case gpir_op_store_varying:
      return 0;

   default:
      return gpir_max_dist_alu(dep);
   }
}
bf215546Sopenharmony_ci
/* Compute node->sched.dist, recursing into predecessors as needed.
 *
 * A leaf gets distance 0. Any other node has its distance raised to the
 * largest "predecessor distance + gpir_min_dist_alu()" over its
 * predecessors.
 */
static void schedule_update_distance(gpir_node *node)
{
   if (gpir_node_is_leaf(node)) {
      node->sched.dist = 0;
      return;
   }

   gpir_node_foreach_pred(node, dep) {
      gpir_node *producer = dep->pred;

      /* A negative distance is treated as "not computed yet". */
      if (producer->sched.dist < 0)
         schedule_update_distance(producer);

      int candidate = producer->sched.dist + gpir_min_dist_alu(dep);
      if (candidate > node->sched.dist)
         node->sched.dist = candidate;
   }
}
bf215546Sopenharmony_ci
/* Return true if at least one successor consumes this node through an
 * input dependency.
 */
static bool gpir_is_input_node(gpir_node *node)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      return true;
   }

   return false;
}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci
/* Number of ready-list slots a node occupies: one for an input node, zero
 * for anything else.
 *
 * Every node is counted as a single slot, dual-slot instructions included.
 * A dual-slot instruction may really need more, but if there turns out to
 * be too little room a move can always be inserted safely. This could in
 * principle loop forever when all fully-ready nodes are dual-slot ones;
 * spilling to registers is relied upon to get out of that situation.
 */
static int gpir_get_slots_required(gpir_node *node)
{
   return gpir_is_input_node(node) ? 1 : 0;
}
bf215546Sopenharmony_ci
/* Debug-only consistency check of the ready list.
 *
 * For every node on the list:
 *  - a node that is not an input node must be fully ready;
 *  - a node flagged ready must have all of its successors scheduled;
 *  - a node not flagged ready must still have an unscheduled successor.
 */
static void ASSERTED verify_ready_list(sched_ctx *ctx)
{
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      bool is_input = gpir_is_input_node(node);

      if (!is_input) {
         assert(node->sched.ready);
      }

      if (!node->sched.ready) {
         /* At least one successor has to be waiting for an instruction. */
         bool unscheduled = false;
         gpir_node_foreach_succ(node, dep) {
            if (!dep->succ->sched.instr)
               unscheduled = true;
         }

         assert(unscheduled);
      } else {
         /* Fully ready: no successor may be left unscheduled. */
         gpir_node_foreach_succ(node, dep) {
            assert(dep->succ->sched.instr);
         }
      }
   }
}
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic void schedule_insert_ready_list(sched_ctx *ctx,
bf215546Sopenharmony_ci                                       gpir_node *insert_node)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   /* if this node is fully ready or partially ready
bf215546Sopenharmony_ci    *   fully ready: all successors have been scheduled
bf215546Sopenharmony_ci    *   partially ready: part of input successors have been scheduled
bf215546Sopenharmony_ci    *
bf215546Sopenharmony_ci    * either fully ready or partially ready node need be inserted to
bf215546Sopenharmony_ci    * the ready list, but we only schedule a move node for partially
bf215546Sopenharmony_ci    * ready node.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   bool ready = true, insert = false;
bf215546Sopenharmony_ci   gpir_node_foreach_succ(insert_node, dep) {
bf215546Sopenharmony_ci      gpir_node *succ = dep->succ;
bf215546Sopenharmony_ci      if (succ->sched.instr) {
bf215546Sopenharmony_ci         if (dep->type == GPIR_DEP_INPUT)
bf215546Sopenharmony_ci            insert = true;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci      else
bf215546Sopenharmony_ci         ready = false;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   insert_node->sched.ready = ready;
bf215546Sopenharmony_ci   /* for root node */
bf215546Sopenharmony_ci   insert |= ready;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (!insert || insert_node->sched.inserted)
bf215546Sopenharmony_ci      return;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   struct list_head *insert_pos = &ctx->ready_list;
bf215546Sopenharmony_ci   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
bf215546Sopenharmony_ci      if ((insert_node->sched.dist > node->sched.dist ||
bf215546Sopenharmony_ci          gpir_op_infos[insert_node->op].schedule_first) &&
bf215546Sopenharmony_ci          !gpir_op_infos[node->op].schedule_first) {
bf215546Sopenharmony_ci         insert_pos = &node->list;
bf215546Sopenharmony_ci         break;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   list_addtail(&insert_node->list, insert_pos);
bf215546Sopenharmony_ci   insert_node->sched.inserted = true;
bf215546Sopenharmony_ci   ctx->ready_list_slots += gpir_get_slots_required(insert_node);
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
/* Largest start bound that the already-scheduled successors place on node:
 * the maximum of "successor instruction index + minimum distance" over
 * them. Returns 0 when no successor has been scheduled yet.
 */
static int gpir_get_max_start(gpir_node *node)
{
   int latest = 0;

   gpir_node_foreach_succ(node, dep) {
      gpir_node *user = dep->succ;

      /* Successors without an instruction impose no constraint. */
      if (user->sched.instr) {
         int bound = user->sched.instr->index + gpir_get_min_dist(dep);
         if (latest < bound)
            latest = bound;
      }
   }

   return latest;
}
bf215546Sopenharmony_ci
/* Smallest end bound that the already-scheduled successors place on node:
 * the minimum of "successor instruction index + maximum distance" over
 * them. Returns INT_MAX when no successor has been scheduled yet.
 */
static int gpir_get_min_end(gpir_node *node)
{
   int earliest = INT_MAX;

   gpir_node_foreach_succ(node, dep) {
      gpir_node *user = dep->succ;

      /* Successors without an instruction impose no constraint. */
      if (user->sched.instr) {
         int bound = user->sched.instr->index + gpir_get_max_dist(dep);
         if (earliest > bound)
            earliest = bound;
      }
   }

   return earliest;
}
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   gpir_load_node *load = gpir_node_to_load(node);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) {
bf215546Sopenharmony_ci      if (!instr->slots[i])
bf215546Sopenharmony_ci         continue;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      gpir_load_node *iload = gpir_node_to_load(instr->slots[i]);
bf215546Sopenharmony_ci      if (load->node.op == iload->node.op &&
bf215546Sopenharmony_ci          load->index == iload->index &&
bf215546Sopenharmony_ci          load->component == iload->component)
bf215546Sopenharmony_ci         return &iload->node;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci   return NULL;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
/* Put "node" into "instr" without touching liveness or the ready list. Besides
 * ordinary node placement, spill_node() uses this to place the loads it
 * creates.
 *
 * Returns true if the node ended up in the instruction (node->sched.instr and
 * node->sched.pos are then set). Returns false otherwise, in which case
 * node->sched.instr/pos are reset to NULL/-1 unless we bailed out before
 * trying any slot. When slots were tried and ctx->instr reported a non-zero
 * slot difference, ctx->max_node_spill_needed and ctx->total_spill_needed are
 * raised accordingly.
 */
static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
{
   if (node->type == gpir_node_type_load) {
      gpir_node *twin = gpir_sched_instr_has_load(instr, node);
      if (twin != NULL) {
         /* A store successor may force this node later than "instr". Fail
          * in that case, the same way the slot loop below would, so that a
          * move node can be created in between later on.
          */
         if (gpir_get_max_start(node) > instr->index)
            return false;

         gpir_debug("same load %d in instr %d for node %d\n",
                    twin->index, instr->index, node->index);

         /* The two nodes are not merged; this one merely pretends to sit in
          * the same instruction and slot as the existing load.
          */
         node->sched.instr = twin->sched.instr;
         node->sched.pos = twin->sched.pos;
         return true;
      }
   }

   /* The stored register may be loaded in the next basic block, and a two
    * instruction gap is still required then. Following what the blob seems
    * to do, register stores are refused in the instructions with index 0
    * and 1.
    *
    * TODO: We may be able to do better than this, but we have to check
    * first if storing a register works across branches.
    */
   if (node->op == gpir_op_store_reg && instr->index < 2)
      return false;

   node->sched.instr = instr;

   int best_max_node_spill = INT_MAX;
   int best_total_spill = INT_MAX;

   for (int *candidate = gpir_op_infos[node->op].slots;
        *candidate != GPIR_INSTR_SLOT_END; candidate++) {
      node->sched.pos = *candidate;

      bool in_window = instr->index >= gpir_get_max_start(node) &&
                       instr->index <= gpir_get_min_end(node);
      if (in_window && gpir_instr_try_insert_node(instr, node))
         return true;

      /* A non-zero slot difference means the node could go here after
       * spilling. Remember the differences of such a position, preferring a
       * smaller one so that fewer nodes need to be spilled.
       */
      if (!ctx->instr->non_cplx_slot_difference &&
          !ctx->instr->slot_difference)
         continue;

      if (ctx->instr->non_cplx_slot_difference < best_max_node_spill ||
          ctx->instr->slot_difference < best_total_spill) {
         best_max_node_spill = ctx->instr->non_cplx_slot_difference;
         best_total_spill = ctx->instr->slot_difference;
      }
   }

   /* Report how many nodes would have to be spilled, if we found out. */
   if (best_max_node_spill != INT_MAX) {
      ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed,
                                        best_max_node_spill);
      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
                                     best_total_spill);
   }

   node->sched.instr = NULL;
   node->sched.pos = -1;
   return false;
}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/* Try to place just the node given, updating the ready list. If "speculative"
bf215546Sopenharmony_ci * is true, then this is part of the pre-commit phase. If false, then we have
bf215546Sopenharmony_ci * committed to placing this node, so update liveness and ready list
bf215546Sopenharmony_ci * information.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node,
bf215546Sopenharmony_ci                                    bool speculative)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   if (!_try_place_node(ctx, ctx->instr, node)) {
bf215546Sopenharmony_ci      if (!speculative)
bf215546Sopenharmony_ci         gpir_debug("failed to place %d\n", node->index);
bf215546Sopenharmony_ci      return false;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   ctx->ready_list_slots -= gpir_get_slots_required(node);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (!speculative) {
bf215546Sopenharmony_ci      gpir_debug("placed node %d\n", node->index);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      /* We assume here that writes are placed before reads. If this changes,
bf215546Sopenharmony_ci       * then this needs to be updated.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      if (node->op == gpir_op_store_reg) {
bf215546Sopenharmony_ci         gpir_store_node *store = gpir_node_to_store(node);
bf215546Sopenharmony_ci         ctx->live_physregs &=
bf215546Sopenharmony_ci            ~(1ull << (4 * store->index + store->component));
bf215546Sopenharmony_ci         if (store->child->sched.physreg_store == store)
bf215546Sopenharmony_ci            store->child->sched.physreg_store = NULL;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (node->op == gpir_op_load_reg) {
bf215546Sopenharmony_ci         gpir_load_node *load = gpir_node_to_load(node);
bf215546Sopenharmony_ci         ctx->live_physregs |=
bf215546Sopenharmony_ci            (1ull << (4 * load->index + load->component));
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      list_del(&node->list);
bf215546Sopenharmony_ci      list_add(&node->list, &ctx->block->node_list);
bf215546Sopenharmony_ci      gpir_node_foreach_pred(node, dep) {
bf215546Sopenharmony_ci         gpir_node *pred = dep->pred;
bf215546Sopenharmony_ci         schedule_insert_ready_list(ctx, pred);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      gpir_node_foreach_pred(node, dep) {
bf215546Sopenharmony_ci         gpir_node *pred = dep->pred;
bf215546Sopenharmony_ci         if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT)
bf215546Sopenharmony_ci            ctx->ready_list_slots += gpir_get_slots_required(pred);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   return true;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/* Create a new node with "node" as the child, replace all uses of "node" with
bf215546Sopenharmony_ci * this new node, and replace "node" with it in the ready list.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic gpir_node *create_replacement(sched_ctx *ctx, gpir_node *node,
bf215546Sopenharmony_ci                                     gpir_op op)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   gpir_alu_node *new_node = gpir_node_create(node->block, op);
bf215546Sopenharmony_ci   if (unlikely(!new_node))
bf215546Sopenharmony_ci      return NULL;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   new_node->children[0] = node;
bf215546Sopenharmony_ci   new_node->num_child = 1;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   new_node->node.sched.instr = NULL;
bf215546Sopenharmony_ci   new_node->node.sched.pos = -1;
bf215546Sopenharmony_ci   new_node->node.sched.dist = node->sched.dist;
bf215546Sopenharmony_ci   new_node->node.sched.max_node = node->sched.max_node;
bf215546Sopenharmony_ci   new_node->node.sched.next_max_node = node->sched.next_max_node;
bf215546Sopenharmony_ci   new_node->node.sched.complex_allowed = node->sched.complex_allowed;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   ctx->ready_list_slots--;
bf215546Sopenharmony_ci   list_del(&node->list);
bf215546Sopenharmony_ci   node->sched.max_node = false;
bf215546Sopenharmony_ci   node->sched.next_max_node = false;
bf215546Sopenharmony_ci   node->sched.ready = false;
bf215546Sopenharmony_ci   node->sched.inserted = false;
bf215546Sopenharmony_ci   gpir_node_replace_succ(&new_node->node, node);
bf215546Sopenharmony_ci   gpir_node_add_dep(&new_node->node, node, GPIR_DEP_INPUT);
bf215546Sopenharmony_ci   schedule_insert_ready_list(ctx, &new_node->node);
bf215546Sopenharmony_ci   return &new_node->node;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   gpir_node *move = create_replacement(ctx, node, gpir_op_mov);
bf215546Sopenharmony_ci   gpir_debug("create move %d for %d\n", move->index, node->index);
bf215546Sopenharmony_ci   return move;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic gpir_node *create_postlog2(sched_ctx *ctx, gpir_node *node)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   assert(node->op == gpir_op_complex1);
bf215546Sopenharmony_ci   gpir_node *postlog2 = create_replacement(ctx, node, gpir_op_postlog2);
bf215546Sopenharmony_ci   gpir_debug("create postlog2 %d for %d\n", postlog2->index, node->index);
bf215546Sopenharmony_ci   return postlog2;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
/* Would dep->pred have all of its successors scheduled once dep->succ is
 * scheduled? That is the case when no successor other than dep->succ is
 * still without an instruction.
 */
static bool pred_almost_ready(gpir_dep *dep)
{
   gpir_node_foreach_succ(dep->pred, sibling) {
      /* The successor we are about to schedule doesn't count. */
      if (sibling->succ == dep->succ)
         continue;

      if (!sibling->succ->sched.instr)
         return false;
   }

   return true;
}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/* Recursively try to schedule a node and all its dependent nodes that can fit
bf215546Sopenharmony_ci * in the same  instruction. There is a simple heuristic scoring system to try
bf215546Sopenharmony_ci * to group together nodes that load different components of the same input,
bf215546Sopenharmony_ci * to avoid bottlenecking for operations like matrix multiplies that are
bf215546Sopenharmony_ci * mostly input-bound.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   if (!schedule_try_place_node(ctx, node, speculative))
bf215546Sopenharmony_ci      return INT_MIN;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   int score = 0;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   gpir_node_foreach_pred(node, dep) {
bf215546Sopenharmony_ci      if (dep->type != GPIR_DEP_INPUT)
bf215546Sopenharmony_ci         continue;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      int pred_score = INT_MIN;
bf215546Sopenharmony_ci      if (pred_almost_ready(dep)) {
bf215546Sopenharmony_ci         if (dep->pred->type == gpir_node_type_load ||
bf215546Sopenharmony_ci             node->type == gpir_node_type_store) {
bf215546Sopenharmony_ci            pred_score = _schedule_try_node(ctx, dep->pred, speculative);
bf215546Sopenharmony_ci         }
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci      if (dep->pred->type == gpir_node_type_load ||
bf215546Sopenharmony_ci          node->type == gpir_node_type_store) {
bf215546Sopenharmony_ci         if (pred_score == INT_MIN) {
bf215546Sopenharmony_ci            if (node->op == gpir_op_mov) {
bf215546Sopenharmony_ci               /* The only moves on the ready list are for loads that we
bf215546Sopenharmony_ci                * couldn't schedule immediately, as created below. If we
bf215546Sopenharmony_ci                * couldn't schedule the load, there's no point scheduling
bf215546Sopenharmony_ci                * the move. The normal move threading logic will ensure
bf215546Sopenharmony_ci                * that another move is created if we're about to go too far
bf215546Sopenharmony_ci                * from the uses of this move.
bf215546Sopenharmony_ci                */
bf215546Sopenharmony_ci               assert(speculative);
bf215546Sopenharmony_ci               return INT_MIN;
bf215546Sopenharmony_ci            } else if (!speculative && dep->pred->type == gpir_node_type_load) {
bf215546Sopenharmony_ci               /* We couldn't schedule the load right away, so it will have
bf215546Sopenharmony_ci                * to happen in some earlier instruction and then be moved
bf215546Sopenharmony_ci                * into a value register and threaded to the use by "node".
bf215546Sopenharmony_ci                * We create the move right away, so that later we'll fail
bf215546Sopenharmony_ci                * to schedule it if there isn't a slot for a move
bf215546Sopenharmony_ci                * available.
bf215546Sopenharmony_ci                */
bf215546Sopenharmony_ci               create_move(ctx, dep->pred);
bf215546Sopenharmony_ci            }
bf215546Sopenharmony_ci            /* Penalize nodes whose dependent ops we couldn't schedule.
bf215546Sopenharmony_ci             */
bf215546Sopenharmony_ci            score--;
bf215546Sopenharmony_ci         } else {
bf215546Sopenharmony_ci            score += pred_score;
bf215546Sopenharmony_ci            continue;
bf215546Sopenharmony_ci         }
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   return score;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/* If we speculatively tried a node, undo everything.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic void schedule_undo_node(sched_ctx *ctx, gpir_node *node)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   gpir_instr_remove_node(ctx->instr, node);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   gpir_node_foreach_pred(node, dep) {
bf215546Sopenharmony_ci      gpir_node *pred = dep->pred;
bf215546Sopenharmony_ci      if (pred->sched.instr) {
bf215546Sopenharmony_ci         schedule_undo_node(ctx, pred);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
/* Schedule "node" together with any predecessors that can share its
 * instruction, and return the resulting score; INT_MIN means "cannot be
 * scheduled at all".
 *
 * If "speculative" is true the placement is only a trial: ready_list_slots
 * is restored and the placed nodes are removed again before returning. The
 * spill requirements recorded in ctx while trying are kept.
 */
static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
{
   const int slots_before = ctx->ready_list_slots;
   int result = _schedule_try_node(ctx, node, speculative);

   /* More values would be live than there are value registers: record how
    * many need to be spilled and reject the attempt.
    */
   if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) {
      assert(speculative);
      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
                                     ctx->ready_list_slots - GPIR_VALUE_REG_NUM);
      result = INT_MIN;
   }

   if (!speculative)
      return result;

   ctx->ready_list_slots = slots_before;
   if (node->sched.instr)
      schedule_undo_node(ctx, node);

   return result;
}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/* This is called when we want to spill "node" by inserting loads at its uses.
bf215546Sopenharmony_ci * It returns all the possible registers we can use so that all the loads will
bf215546Sopenharmony_ci * successfully be inserted. Also return the first instruction we'll need to
bf215546Sopenharmony_ci * insert a load for.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node,
bf215546Sopenharmony_ci                                   int *min_index)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   uint64_t available = ~0ull;
bf215546Sopenharmony_ci   gpir_node_foreach_succ(node, dep) {
bf215546Sopenharmony_ci      if (dep->type != GPIR_DEP_INPUT)
bf215546Sopenharmony_ci         continue;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      gpir_node *use = dep->succ;
bf215546Sopenharmony_ci      gpir_instr *instr = use->sched.instr;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (!instr) {
bf215546Sopenharmony_ci         /* This use isn't scheduled, so no need to spill it. */
bf215546Sopenharmony_ci         continue;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (use->type == gpir_node_type_store) {
bf215546Sopenharmony_ci         /* We're trying to spill something that was recently stored... just
bf215546Sopenharmony_ci          * bail out.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         return 0;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (use->op == gpir_op_mov && instr == ctx->instr) {
bf215546Sopenharmony_ci         /* We try to spill the sources of this move, so we can free up space
bf215546Sopenharmony_ci          * in the current instruction.
bf215546Sopenharmony_ci          *
bf215546Sopenharmony_ci          * TODO: should we go back further? It might let us schedule the
bf215546Sopenharmony_ci          * write earlier in some cases, but then we might fail to spill.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         available &= get_available_regs(ctx, use, min_index);
bf215546Sopenharmony_ci      } else {
bf215546Sopenharmony_ci         if (instr->index < *min_index)
bf215546Sopenharmony_ci            *min_index = instr->index;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         uint64_t use_available = 0;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         if (instr->reg0_use_count == 0)
bf215546Sopenharmony_ci            use_available = ~0ull;
bf215546Sopenharmony_ci         else if (!instr->reg0_is_attr)
bf215546Sopenharmony_ci            use_available = 0xfull << (4 * instr->reg0_index);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         if (instr->reg1_use_count == 0)
bf215546Sopenharmony_ci            use_available = ~0ull;
bf215546Sopenharmony_ci         else
bf215546Sopenharmony_ci            use_available |= 0xfull << (4 * instr->reg1_index);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         available &= use_available;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   return available;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/* Using "min_index" returned by get_available_regs(), figure out which
bf215546Sopenharmony_ci * registers are killed by a write after or during the current instruction and
bf215546Sopenharmony_ci * hence we can't use for spilling. Writes that haven't been scheduled yet
bf215546Sopenharmony_ci * should be reflected in live_physregs.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic uint64_t get_killed_regs(sched_ctx *ctx, int min_index)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   uint64_t killed = 0;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) {
bf215546Sopenharmony_ci      if (instr->index <= min_index)
bf215546Sopenharmony_ci         break;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3;
bf215546Sopenharmony_ci           slot++) {
bf215546Sopenharmony_ci         if (!instr->slots[slot])
bf215546Sopenharmony_ci            continue;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         gpir_store_node *store = gpir_node_to_store(instr->slots[slot]);
bf215546Sopenharmony_ci         if (store->node.op != gpir_op_store_reg)
bf215546Sopenharmony_ci            continue;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         killed |= 1ull << (4 * store->index + store->component);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   return killed;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/* Actually spill a node so that it is no longer in the ready list. Note that
bf215546Sopenharmony_ci * this must exactly follow the logic of get_available_regs() or else the
bf215546Sopenharmony_ci * loads could fail to schedule.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   gpir_node_foreach_succ_safe(node, dep) {
bf215546Sopenharmony_ci      if (dep->type != GPIR_DEP_INPUT)
bf215546Sopenharmony_ci         continue;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      gpir_node *use = dep->succ;
bf215546Sopenharmony_ci      gpir_instr *instr = use->sched.instr;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (!instr)
bf215546Sopenharmony_ci         continue;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (use->op == gpir_op_mov && instr == ctx->instr) {
bf215546Sopenharmony_ci         spill_node(ctx, use, store);
bf215546Sopenharmony_ci      } else {
bf215546Sopenharmony_ci         gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg);
bf215546Sopenharmony_ci         load->index = store->index;
bf215546Sopenharmony_ci         load->component = store->component;
bf215546Sopenharmony_ci         list_add(&load->node.list, &ctx->block->node_list);
bf215546Sopenharmony_ci         gpir_node_replace_child(dep->succ, dep->pred, &load->node);
bf215546Sopenharmony_ci         gpir_node_replace_pred(dep, &load->node);
bf215546Sopenharmony_ci         gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
bf215546Sopenharmony_ci         gpir_debug("spilling use %d of node %d to load node %d\n",
bf215546Sopenharmony_ci                    use->index, node->index, load->node.index);
bf215546Sopenharmony_ci         ASSERTED bool result = _try_place_node(ctx, use->sched.instr, &load->node);
bf215546Sopenharmony_ci         assert(result);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (node->op == gpir_op_mov) {
bf215546Sopenharmony_ci      /* We replaced all the uses of the move, so it's dead now. */
bf215546Sopenharmony_ci      gpir_instr_remove_node(node->sched.instr, node);
bf215546Sopenharmony_ci      gpir_node_delete(node);
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      /* We deleted all the uses of the node except the store, so it's not
bf215546Sopenharmony_ci       * live anymore.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      list_del(&node->list);
bf215546Sopenharmony_ci      node->sched.inserted = false;
bf215546Sopenharmony_ci      ctx->ready_list_slots--;
bf215546Sopenharmony_ci      if (node->sched.max_node) {
bf215546Sopenharmony_ci         node->sched.max_node = false;
bf215546Sopenharmony_ci         ctx->instr->alu_num_slot_needed_by_max--;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci      if (node->sched.next_max_node) {
bf215546Sopenharmony_ci         node->sched.next_max_node = false;
bf215546Sopenharmony_ci         ctx->instr->alu_num_unscheduled_next_max--;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
/* Is "node" an input of a store node that is scheduled in "instr"? */
static bool used_by_store(gpir_node *node, gpir_instr *instr)
{
   gpir_node_foreach_succ(node, dep) {
      gpir_node *user = dep->succ;

      bool is_input = dep->type == GPIR_DEP_INPUT;
      if (is_input && user->type == gpir_node_type_store &&
          user->sched.instr == instr)
         return true;
   }

   return false;
}
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic gpir_node *consuming_postlog2(gpir_node *node)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   if (node->op != gpir_op_complex1)
bf215546Sopenharmony_ci      return NULL;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   gpir_node_foreach_succ(node, dep) {
bf215546Sopenharmony_ci      if (dep->type != GPIR_DEP_INPUT)
bf215546Sopenharmony_ci         continue;
bf215546Sopenharmony_ci      if (dep->succ->op == gpir_op_postlog2)
bf215546Sopenharmony_ci         return dep->succ;
bf215546Sopenharmony_ci      else
bf215546Sopenharmony_ci         return NULL;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   return NULL;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
/* Try to spill "node" to a physical register so that it no longer occupies
 * the ready list.
 *
 * If the node already has a pending register store (sched.physreg_store),
 * that register is reused, provided it is usable by all the uses that need a
 * load. Otherwise a free register is picked, a store_reg node is created and
 * inserted into the ready list, and write-after-read dependencies are added
 * against the loads recorded in ctx->physreg_reads for that register.
 *
 * Returns true if the node was spilled, false if it can't be spilled (used
 * by a store in the current instruction, no suitable register, or the
 * postlog2 replacement node could not be created).
 */
static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
{
   assert(node->op != gpir_op_mov);

   if (used_by_store(node, ctx->instr))
      return false;

   gpir_debug("trying to spill %d\n", node->index);

   /* Registers every load can use, minus those overwritten by stores in the
    * instructions get_killed_regs() walks.
    */
   int min_instr = INT_MAX;
   uint64_t available = get_available_regs(ctx, node, &min_instr);
   available &= ~get_killed_regs(ctx, min_instr);

   if (node->sched.physreg_store) {
      /* Already being stored somewhere: we can only reuse that register. */
      gpir_store_node *store = node->sched.physreg_store;
      if (!(available & (1ull << (4 * store->index + store->component))))
         return false;
   } else {
      available &= ~ctx->live_physregs;

      if (available == 0)
         return false;

      /* Don't spill complex1 if it's used postlog2, turn the postlog2 into a
       * move, replace the complex1 with postlog2 and spill that instead. The
       * store needs a move anyways so the postlog2 is usually free.
       */
      gpir_node *postlog2 = consuming_postlog2(node);
      if (postlog2) {
         postlog2->op = gpir_op_mov;

         gpir_node *replacement = create_postlog2(ctx, node);
         if (!replacement) {
            /* Nothing else has been modified yet, so put the consumer back
             * the way it was and give up on this node.
             */
            postlog2->op = gpir_op_postlog2;
            return false;
         }
         node = replacement;
      }

      /* TODO: use a better heuristic for choosing an available register? */
      int physreg = ffsll(available) - 1;

      ctx->live_physregs |= (1ull << physreg);

      /* NOTE(review): the result of gpir_node_create() is not checked here. */
      gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
      store->index = physreg / 4;
      store->component = physreg % 4;
      store->child = node;
      store->node.sched.max_node = false;
      store->node.sched.next_max_node = false;
      store->node.sched.complex_allowed = false;
      store->node.sched.pos = -1;
      store->node.sched.instr = NULL;
      store->node.sched.inserted = false;
      store->node.sched.dist = node->sched.dist;
      if (node->op == gpir_op_complex1) {
         /* Complex1 cannot be directly stored, and has a latency of 2 */
         store->node.sched.dist += 2;
      }
      node->sched.physreg_store = store;
      gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);

      /* The store gets a write-after-read dependency on every load recorded
       * for this register. Loads that were marked ready are taken off their
       * list and lose the ready flag.
       */
      list_for_each_entry(gpir_load_node, load,
                          &ctx->physreg_reads[physreg], reg_link) {
         gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
         if (load->node.sched.ready) {
            list_del(&load->node.list);
            load->node.sched.ready = false;
         }
      }

      node->sched.ready = false;
      schedule_insert_ready_list(ctx, &store->node);
   }

   gpir_debug("spilling %d to $%d.%c, store %d\n", node->index,
              node->sched.physreg_store->index,
              "xyzw"[node->sched.physreg_store->component],
              node->sched.physreg_store->node.index);

   spill_node(ctx, node, node->sched.physreg_store);

   return true;
}
bf215546Sopenharmony_ci
/* Try to make room for orig_node by spilling other nodes on the ready list.
 *
 * ctx->max_node_spill_needed and ctx->total_spill_needed hold how many spills
 * are still wanted; every successful try_spill_node() call decrements them.
 * Both passes walk the ready list from the back and stop as soon as they
 * reach orig_node.
 *
 * Returns true if neither counter is positive any more afterwards.
 */
static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node)
{
   /* First, try to spill max nodes. */
   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
      if (ctx->max_node_spill_needed <= 0)
         break;

      /* orig_node is the node we're trying to schedule, so spilling it makes
       * no sense. Also don't try to spill any nodes in front of it, since
       * they might be scheduled instead.
       */
      if (node == orig_node)
         break;

      /* Don't try to spill loads, since that only adds another load and
       * store which is likely pointless.
       *
       * NOTE(review): the test is on gpir_op_mov, not on a load op --
       * presumably the movs found here stand in for loads; confirm.
       */
      if (node->op == gpir_op_mov)
         continue;

      if (!gpir_is_input_node(node) || !node->sched.max_node)
         continue;

      /* A spilled max node counts against both budgets. */
      if (try_spill_node(ctx, node)) {
         ctx->max_node_spill_needed--;
         ctx->total_spill_needed--;
      }
   }

   /* Now, try to spill the remaining nodes (max or next-max). */
   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
      if (ctx->total_spill_needed <= 0)
         break;

      if (node == orig_node)
         break;

      if (node->op == gpir_op_mov)
         continue;

      if (!gpir_is_input_node(node) ||
          !(node->sched.max_node || node->sched.next_max_node))
         continue;

      if (try_spill_node(ctx, node))
         ctx->total_spill_needed--;
   }

   return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
}

/* Recompute the number of slots required by everything currently on the
 * ready list by summing gpir_get_slots_required() over all entries. The
 * result is compared against ctx->ready_list_slots in an assert, hence the
 * ASSERTED marker.
 */
static int ASSERTED gpir_get_curr_ready_list_slots(sched_ctx *ctx)
{
   int total = 0;

   list_for_each_entry(gpir_node, node, &ctx->ready_list, list)
      total += gpir_get_slots_required(node);

   return total;
}

/* What gpir_get_min_end() would return if node were replaced with a move
 * instruction not in the complex slot. Normally this is 2 + min_end, except
 * for some store instructions which must have the move node in the same
 * instruction.
 *
 * Only already-scheduled successors connected through a GPIR_DEP_INPUT
 * dependency are considered, and store_temp/store_reg/store_varying
 * successors are ignored. Returns INT_MAX when no successor qualifies.
 */
static int gpir_get_min_end_as_move(gpir_node *node)
{
   int min = INT_MAX;

   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;

      if (!succ->sched.instr || dep->type != GPIR_DEP_INPUT)
         continue;

      /* Stores do not constrain where the move may end up. */
      if (succ->op == gpir_op_store_temp ||
          succ->op == gpir_op_store_reg ||
          succ->op == gpir_op_store_varying)
         continue;

      int end = succ->sched.instr->index + 2;
      if (min > end)
         min = end;
   }

   return min;
}

bf215546Sopenharmony_ci/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
bf215546Sopenharmony_ci * The hardware overwrites the add second sources with 0 and mul second
bf215546Sopenharmony_ci * sources with 1. This can be a problem if we need to insert more next-max
bf215546Sopenharmony_ci * moves but we only have values that can't use the complex unit for moves.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Fortunately, we only need to insert a next-max move if there are more than
bf215546Sopenharmony_ci * 5 next-max nodes, but there are only 4 sources in the previous instruction
bf215546Sopenharmony_ci * that make values not complex-capable, which means there can be at most 4
bf215546Sopenharmony_ci * non-complex-capable values. Hence there will always be at least two values
bf215546Sopenharmony_ci * that can be rewritten to use a move in the complex slot. However, we have
bf215546Sopenharmony_ci * to be careful not to waste those values by putting both of them in a
bf215546Sopenharmony_ci * non-complex slot. This is handled for us by gpir_instr, which will reject
bf215546Sopenharmony_ci * such instructions. We just need to tell it which nodes can use complex, and
bf215546Sopenharmony_ci * it will do the accounting to figure out what is safe.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic bool can_use_complex(gpir_node *node)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   gpir_node_foreach_succ(node, dep) {
bf215546Sopenharmony_ci      if (dep->type != GPIR_DEP_INPUT)
bf215546Sopenharmony_ci         continue;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      gpir_node *succ = dep->succ;
bf215546Sopenharmony_ci      if (succ->type != gpir_node_type_alu ||
bf215546Sopenharmony_ci          !succ->sched.instr)
bf215546Sopenharmony_ci         continue;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      /* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1}
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      gpir_alu_node *alu = gpir_node_to_alu(succ);
bf215546Sopenharmony_ci      switch (alu->node.op) {
bf215546Sopenharmony_ci      case gpir_op_complex1:
bf215546Sopenharmony_ci         /* complex1 puts its third source in the fourth slot */
bf215546Sopenharmony_ci         if (alu->children[1] == node || alu->children[2] == node)
bf215546Sopenharmony_ci            return false;
bf215546Sopenharmony_ci         break;
bf215546Sopenharmony_ci      case gpir_op_complex2:
bf215546Sopenharmony_ci         /* complex2 has its source duplicated, since it actually takes two
bf215546Sopenharmony_ci          * sources but we only ever use it with both sources the same. Hence
bf215546Sopenharmony_ci          * its source can never be the complex slot.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         return false;
bf215546Sopenharmony_ci      case gpir_op_select:
bf215546Sopenharmony_ci         /* Select has its sources rearranged */
bf215546Sopenharmony_ci         if (alu->children[0] == node)
bf215546Sopenharmony_ci            return false;
bf215546Sopenharmony_ci         break;
bf215546Sopenharmony_ci      default:
bf215546Sopenharmony_ci         assert(alu->num_child <= 2);
bf215546Sopenharmony_ci         if (alu->num_child == 2 && alu->children[1] == node)
bf215546Sopenharmony_ci            return false;
bf215546Sopenharmony_ci         break;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   return true;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/* Initialize node->sched.max_node and node->sched.next_max_node for every
bf215546Sopenharmony_ci * input node on the ready list. We should only need to do this once per
bf215546Sopenharmony_ci * instruction, at the beginning, since we never add max nodes to the ready
bf215546Sopenharmony_ci * list.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic void sched_find_max_nodes(sched_ctx *ctx)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   ctx->instr->alu_num_unscheduled_next_max = 0;
bf215546Sopenharmony_ci   ctx->instr->alu_num_slot_needed_by_max = 0;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
bf215546Sopenharmony_ci      if (!gpir_is_input_node(node))
bf215546Sopenharmony_ci         continue;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      int min_end_move = gpir_get_min_end_as_move(node);
bf215546Sopenharmony_ci      node->sched.max_node = (min_end_move == ctx->instr->index);
bf215546Sopenharmony_ci      node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
bf215546Sopenharmony_ci      if (node->sched.next_max_node)
bf215546Sopenharmony_ci         node->sched.complex_allowed = can_use_complex(node);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (node->sched.max_node)
bf215546Sopenharmony_ci         ctx->instr->alu_num_slot_needed_by_max++;
bf215546Sopenharmony_ci      if (node->sched.next_max_node)
bf215546Sopenharmony_ci         ctx->instr->alu_num_unscheduled_next_max++;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
/* Verify the invariants described in gpir.h, as well as making sure the
 * counts are correct.
 *
 * The counters are recomputed from scratch by walking the input nodes on the
 * ready list and are then compared with the values cached in ctx->instr.
 */
static void ASSERTED verify_max_nodes(sched_ctx *ctx)
{
   int alu_num_slot_needed_by_max = 0;
   int alu_num_unscheduled_next_max = 0;
   int alu_num_slot_needed_by_store = 0;
   int alu_num_slot_needed_by_non_cplx_store = 0;
   ASSERTED int alu_max_allowed_next_max = 5;

   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node))
         continue;

      if (node->sched.max_node)
         alu_num_slot_needed_by_max++;
      if (node->sched.next_max_node)
         alu_num_unscheduled_next_max++;
      if (used_by_store(node, ctx->instr)) {
         alu_num_slot_needed_by_store++;
         if (node->sched.next_max_node && !node->sched.complex_allowed)
            alu_num_slot_needed_by_non_cplx_store++;
      }
   }

   /* A complex1 in the MUL0 slot lowers the allowed next-max count to 4. */
   if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] &&
       ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1)
      alu_max_allowed_next_max = 4;

   /* The cached counters must match the freshly computed ones... */
   assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
   assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max);
   assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max);
   assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
   assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
          alu_num_slot_needed_by_non_cplx_store);

   /* ...and there must be enough free slots left to satisfy them. */
   assert(ctx->instr->alu_num_slot_free >=
          alu_num_slot_needed_by_store + alu_num_slot_needed_by_max +
          MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0));
   assert(ctx->instr->alu_non_cplx_slot_free >=
          alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
}

/* Pick the best fully-ready node from the ready list and schedule it into
 * the current instruction.
 *
 * Every ready node is first tried speculatively through
 * schedule_try_node(ctx, node, true). When that fails with INT_MIN, no
 * candidate has been found yet and ctx->total_spill_needed is positive,
 * try_spill_nodes() is given a chance and the node is tried once more. The
 * node with the highest score wins, except that a schedulable
 * schedule_first node is taken immediately.
 *
 * Returns true if a node was scheduled, false if nothing could be placed.
 */
static bool try_node(sched_ctx *ctx)
{
   gpir_node *best_node = NULL;
   int best_score = INT_MIN;

   /* Spilling will delete arbitrary nodes after the current one in the ready
    * list, which means that we always need to look up the next node in the
    * list at the end of each iteration. While list_for_each_entry() works for
    * this purpose, its sanity checking assumes that you don't want to modify
    * the list at all. We know better here, so we have to open-code
    * list_for_each_entry() without the check in order to not assert.
    */
   for (gpir_node *node = list_entry(ctx->ready_list.next, gpir_node, list);
        &node->list != &ctx->ready_list;
        node = list_entry(node->list.next, gpir_node, list)) {
      /* Once there is a candidate, stop at the first node whose dist is
       * smaller than the candidate's.
       */
      if (best_score != INT_MIN &&
          node->sched.dist < best_node->sched.dist)
         break;

      if (!node->sched.ready)
         continue;

      /* The spill counters are reset before each speculative attempt; the
       * retry below relies on total_spill_needed being positive afterwards.
       */
      ctx->total_spill_needed = 0;
      ctx->max_node_spill_needed = 0;
      int score = schedule_try_node(ctx, node, true);
      if (score == INT_MIN && !best_node &&
          ctx->total_spill_needed > 0 &&
          try_spill_nodes(ctx, node)) {
         score = schedule_try_node(ctx, node, true);
      }

      /* schedule_first nodes must be scheduled if possible */
      if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) {
         best_node = node;
         best_score = score;
         break;
      }

      if (score > best_score) {
         best_score = score;
         best_node = node;
      }
   }

   if (!best_node)
      return false;

   gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
              best_score, best_node->sched.max_node ? " (max)" : "");

   /* The speculative attempt succeeded, so the real one has to as well. */
   ASSERTED int score = schedule_try_node(ctx, best_node, false);
   assert(score != INT_MIN);
   return true;
}

/* Insert a move for node into the current instruction.
 *
 * If node is consumed by a postlog2 (see consuming_postlog2()), that
 * postlog2 is turned into the move and a fresh postlog2 is created instead.
 * Otherwise a move is created with create_move(); successors of the move
 * that are not scheduled yet, or for which the current instruction index is
 * below the successor's instruction index plus gpir_get_min_dist(dep), are
 * pointed back at node. The move is then scheduled, which must succeed.
 */
static void place_move(sched_ctx *ctx, gpir_node *node)
{
   /* For complex1 that is consumed by a postlog2, we cannot allow any moves
    * in between. Convert the postlog2 to a move and insert a new postlog2,
    * and try to schedule it again in try_node().
    */
   gpir_node *postlog2 = consuming_postlog2(node);
   if (postlog2) {
      postlog2->op = gpir_op_mov;
      create_postlog2(ctx, node);
      return;
   }

   gpir_node *move = create_move(ctx, node);
   gpir_node_foreach_succ_safe(move, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr ||
          ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
         /* Hand this successor back to the original node. */
         gpir_node_replace_pred(dep, node);
         if (dep->type == GPIR_DEP_INPUT)
            gpir_node_replace_child(succ, move, node);
      }
   }

   ASSERTED int score = schedule_try_node(ctx, move, false);
   assert(score != INT_MIN);
}

/* For next-max nodes, not every node can be offloaded to a move in the
 * complex slot. If we run out of non-complex slots, then such nodes cannot
 * have moves placed for them. There should always be sufficient
 * complex-capable nodes so that this isn't a problem. We also disallow moves
 * for schedule_first nodes here.
 *
 * Returns true if a move may be placed for node in the current instruction.
 */
static bool can_place_move(sched_ctx *ctx, gpir_node *node)
{
   if (gpir_op_infos[node->op].schedule_first)
      return false;

   /* Only next-max nodes that cannot use the complex slot are restricted. */
   if (!node->sched.next_max_node || node->sched.complex_allowed)
      return true;

   return ctx->instr->alu_non_cplx_slot_free > 0;
}

/* Place at most one move into the current instruction.
 *
 * Candidates are looked for in this order, and the first hit wins:
 *   1. any max node on the ready list;
 *   2. a node used by a store in this instruction, if the instruction still
 *      needs slots for stores;
 *   3. a ready next-max complex1 node with at least one input successor that
 *      is not scheduled in the instruction with index
 *      ctx->instr->index - 1;
 *   4. any other next-max node, walking the ready list backwards;
 *   5. any next-max node, if there are more unscheduled next-max nodes than
 *      the instruction allows.
 * Steps 3 to 5 only consider nodes accepted by can_place_move(), and steps 3
 * and 4 additionally require a free ALU slot.
 *
 * Returns true if a move was placed, false otherwise.
 */
static bool sched_move(sched_ctx *ctx)
{
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (node->sched.max_node) {
         place_move(ctx, node);
         return true;
      }
   }

   if (ctx->instr->alu_num_slot_needed_by_store > 0) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (used_by_store(node, ctx->instr)) {
            place_move(ctx, node);
            /* If we have a store of a load, then we need to make sure that we
             * immediately schedule the dependent load, or create a move
             * instruction for it, like we would with a normal instruction.
             * The rest of the code isn't set up to handle load nodes in the
             * ready list -- see the comments in _schedule_try_node().
             */
            if (node->type == gpir_node_type_load) {
               if (!schedule_try_place_node(ctx, node, false)) {
                  create_move(ctx, node);
               }
            }
            return true;
         }
      }
   }

   /* complex1 is a bit of a special case, since it has a latency of 2
    * cycles. Once it is fully ready, we need to group all its uses in the
    * same instruction, and then we need to avoid creating any moves in the
    * next cycle in order to get it scheduled. Failing to do any of these
    * things could result in a cycle penalty, or even worse, an infinite loop
    * of inserting moves. If it is a next-max node and ready, then it has a
    * use in the previous cycle. If it has a use in the current cycle as
    * well, then we want to insert a move node to make it ready in two cycles
    * -- if we don't, then there will be at least a one cycle penalty.
    * Otherwise, it will be ready next cycle, and we shouldn't insert a move
    * node, or else we'll also have a one cycle penalty.
    */
   if (ctx->instr->alu_num_slot_free > 0) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
             node->sched.ready) {
            /* Skip the node when every input successor sits in the
             * instruction with index ctx->instr->index - 1.
             */
            bool skip = true;
            gpir_node_foreach_succ(node, dep) {
               if (dep->type != GPIR_DEP_INPUT)
                  continue;

               gpir_node *succ = dep->succ;

               if (!succ->sched.instr ||
                   succ->sched.instr->index != ctx->instr->index - 1) {
                  skip = false;
                  break;
               }
            }

            if (skip)
               continue;

            place_move(ctx, node);
            return true;
         }
      }
   }

   /* Once we've made all the required moves, we're free to use any extra
    * slots to schedule more moves for next max nodes. Besides sometimes being
    * necessary, this can free up extra space in the next instruction. We walk
    * from back to front so that we pick nodes less likely to be scheduled
    * next first -- an extra move would be unnecessary there. But make sure
    * not to handle the complex1 case handled above.
    */
   if (ctx->instr->alu_num_slot_free > 0) {
      list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node &&
             !(node->op == gpir_op_complex1 && node->sched.ready)) {
            place_move(ctx, node);
            return true;
         }
      }
   }

   /* We may have skipped complex1 above, but if we run out of space, we still
    * need to insert the move.
    */
   if (ctx->instr->alu_num_unscheduled_next_max >
       ctx->instr->alu_max_allowed_next_max) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node) {
            place_move(ctx, node);
            return true;
         }
      }
   }

   return false;
}

/* Run one scheduling step for the current instruction: first try to
 * schedule a real node, and only if that fails try to place a move.
 *
 * Returns true if either step made progress, false once neither can.
 */
static bool gpir_sched_instr_pass(sched_ctx *ctx)
{
   return try_node(ctx) || sched_move(ctx);
}

/* Debug dump printed before an instruction is filled; does nothing unless
 * LIMA_DEBUG_GP is set in lima_debug.
 *
 * Prints every ready-list entry as
 * "index/r|p (dist, slots required, max|next|none)", followed by the live
 * physical registers. ctx->live_physregs is read as a bitmask with four
 * consecutive bits (x, y, z, w) for each of 16 registers.
 */
static void schedule_print_pre_one_instr(sched_ctx *ctx)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("instr %d for ready list:", ctx->instr->index);
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p',
             node->sched.dist, gpir_get_slots_required(node),
             node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none"));
   }
   printf("\nlive physregs: ");
   for (unsigned i = 0; i < 16; i++) {
      /* Skip registers with no live component at all. */
      if (ctx->live_physregs & (0xfull << (4 * i))) {
         /* i is unsigned, so it has to be printed with %u rather than %d. */
         printf("$%u.", i);
         for (unsigned j = 0; j < 4; j++) {
            if (ctx->live_physregs & (1ull << (4 * i + j)))
               printf("%c", "xyzw"[j]);
         }
         printf(" ");
      }
   }
   printf("\n");
}

/* Debug dump printed after an instruction has been filled; does nothing
 * unless LIMA_DEBUG_GP is set in lima_debug. Every occupied slot is printed
 * as "slot number/node index".
 */
static void schedule_print_post_one_instr(gpir_instr *instr)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("post schedule instr");
   for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) {
      if (instr->slots[i])
         printf(" %d/%d", i, instr->slots[i]->index);
   }
   printf("\n");
}

/* Create a new instruction in ctx->block and fill it by running
 * gpir_sched_instr_pass() until it reports no further progress.
 *
 * Returns false if the instruction could not be created, true otherwise.
 */
static bool schedule_one_instr(sched_ctx *ctx)
{
   gpir_instr *instr = gpir_instr_create(ctx->block);
   if (unlikely(!instr))
      return false;

   ctx->instr = instr;

   sched_find_max_nodes(ctx);
   schedule_print_pre_one_instr(ctx);

   while (gpir_sched_instr_pass(ctx)) {
      /* Re-check the bookkeeping after every step in debug builds. */
      assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
#ifndef NDEBUG
      verify_max_nodes(ctx);
      verify_ready_list(ctx);
#endif
   }

   schedule_print_post_one_instr(instr);
   return true;
}

/* Schedule all nodes of block into instructions.
 *
 * Distances are computed starting from the root nodes, a scheduling context
 * is set up with the block's live-out physical registers, every load_reg
 * node is recorded in the per-register read lists, and the root nodes seed
 * the ready list. Instructions are then created until the ready list is
 * empty.
 *
 * Returns false as soon as schedule_one_instr() fails, true otherwise.
 */
static bool schedule_block(gpir_block *block)
{
   /* calculate distance */
   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      if (gpir_node_is_root(node))
         schedule_update_distance(node);
   }

   sched_ctx ctx;
   list_inithead(&ctx.ready_list);
   ctx.block = block;
   ctx.ready_list_slots = 0;
   ctx.live_physregs = block->live_out_phys;

   for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
      list_inithead(&ctx.physreg_reads[i]);
   }

   /* construct the ready list from root nodes */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      /* Add to physreg_reads; the list is selected by
       * 4 * register index + component.
       */
      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         unsigned index = 4 * load->index + load->component;
         list_addtail(&load->reg_link, &ctx.physreg_reads[index]);
      }

      if (gpir_node_is_root(node))
         schedule_insert_ready_list(&ctx, node);
   }

   /* Reset the block's node list before scheduling starts. */
   list_inithead(&block->node_list);
   while (!list_is_empty(&ctx.ready_list)) {
      if (!schedule_one_instr(&ctx))
         return false;
   }

   return true;
}

/* Fold every dummy_m node of block back into the node it was created from.
 *
 * For a dummy_m node, children[0] is that origin node and children[1] is the
 * accompanying dummy_f node. Each successor of the dummy_m gets a dependency
 * of the same type on the origin and has its child pointer redirected to the
 * origin; afterwards both the dummy_f and the dummy_m are deleted.
 */
static void schedule_build_dependency(gpir_block *block)
{
   /* merge dummy_f/m into the node they were created from */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      if (node->op != gpir_op_dummy_m)
         continue;

      gpir_alu_node *alu = gpir_node_to_alu(node);
      gpir_node *origin = alu->children[0];
      gpir_node *dummy_f = alu->children[1];

      gpir_node_foreach_succ(node, dep) {
         gpir_node *succ = dep->succ;
         /* origin and node may have same succ (by VREG/INPUT or
          * VREG/VREG dep), so use gpir_node_add_dep() instead of
          * gpir_node_replace_pred() */
         gpir_node_add_dep(succ, origin, dep->type);
         gpir_node_replace_child(succ, node, origin);
      }
      gpir_node_delete(dummy_f);
      gpir_node_delete(node);
   }
}

bf215546Sopenharmony_cistatic void print_statistic(gpir_compiler *comp, int save_index)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   int num_nodes[gpir_op_num] = {0};
bf215546Sopenharmony_ci   int num_created_nodes[gpir_op_num] = {0};
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
bf215546Sopenharmony_ci      list_for_each_entry(gpir_node, node, &block->node_list, list) {
bf215546Sopenharmony_ci         num_nodes[node->op]++;
bf215546Sopenharmony_ci         if (node->index >= save_index)
bf215546Sopenharmony_ci            num_created_nodes[node->op]++;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   printf("====== gpir scheduler statistic ======\n");
bf215546Sopenharmony_ci   printf("---- how many nodes are scheduled ----\n");
bf215546Sopenharmony_ci   int n = 0, l = 0;
bf215546Sopenharmony_ci   for (int i = 0; i < gpir_op_num; i++) {
bf215546Sopenharmony_ci      if (num_nodes[i]) {
bf215546Sopenharmony_ci         printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]);
bf215546Sopenharmony_ci         n += num_nodes[i];
bf215546Sopenharmony_ci         if (!(++l % 4))
bf215546Sopenharmony_ci            printf("\n");
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci   if (l % 4)
bf215546Sopenharmony_ci      printf("\n");
bf215546Sopenharmony_ci   printf("\ntotal: %d\n", n);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   printf("---- how many nodes are created ----\n");
bf215546Sopenharmony_ci   n = l = 0;
bf215546Sopenharmony_ci   for (int i = 0; i < gpir_op_num; i++) {
bf215546Sopenharmony_ci      if (num_created_nodes[i]) {
bf215546Sopenharmony_ci         printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]);
bf215546Sopenharmony_ci         n += num_created_nodes[i];
bf215546Sopenharmony_ci         if (!(++l % 4))
bf215546Sopenharmony_ci            printf("\n");
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci   if (l % 4)
bf215546Sopenharmony_ci      printf("\n");
bf215546Sopenharmony_ci   printf("\ntotal: %d\n", n);
bf215546Sopenharmony_ci   printf("------------------------------------\n");
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
/*
 * Schedule every block of a GP program.
 *
 * Runs three passes over comp->block_list: reset the per-block and per-node
 * scheduling state, run schedule_build_dependency() on each block, then
 * schedule each block with schedule_block().
 *
 * Returns false (after reporting through gpir_error) as soon as one block
 * fails to schedule, true otherwise. When LIMA_DEBUG_GP is set in
 * lima_debug, the statistics and the resulting instructions are printed
 * after a successful run.
 */
bool gpir_schedule_prog(gpir_compiler *comp)
{
   /* Remember the index counter on entry; print_statistic() uses it to tell
    * nodes with a later index apart from the ones that already existed.
    */
   int first_new_index = comp->cur_index;

   /* init schedule info */
   int next_sched_index = 0;
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      block->sched.instr_index = 0;

      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         /* sched.index numbers the nodes consecutively across all blocks */
         node->sched.index = next_sched_index;
         next_sched_index++;

         node->sched.instr = NULL;
         /* TODO when we support multiple basic blocks, we need a way to keep
          * track of this for physregs allocated before the scheduler.
          */
         node->sched.physreg_store = NULL;

         node->sched.pos = -1;
         node->sched.dist = -1;

         node->sched.ready = false;
         node->sched.inserted = false;
         node->sched.complex_allowed = false;
         node->sched.max_node = false;
         node->sched.next_max_node = false;
      }
   }

   /* build dependency */
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      schedule_build_dependency(block);
   }

   /* Debugging aid: a gpir_node_print_prog_dep(comp) call at this point
    * dumps the program after the scheduler has built its dependencies.
    */

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      if (schedule_block(block))
         continue;

      gpir_error("fail schedule block\n");
      return false;
   }

   if (lima_debug & LIMA_DEBUG_GP) {
      print_statistic(comp, first_new_index);
      gpir_instr_print_prog(comp);
   }

   return true;
}