1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright (c) 2017 Lima Project 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sub license, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the 12bf215546Sopenharmony_ci * next paragraph) shall be included in all copies or substantial portions 13bf215546Sopenharmony_ci * of the Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21bf215546Sopenharmony_ci * DEALINGS IN THE SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci */ 24bf215546Sopenharmony_ci 25bf215546Sopenharmony_ci#include <limits.h> 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci#include "gpir.h" 28bf215546Sopenharmony_ci 29bf215546Sopenharmony_ci/* 30bf215546Sopenharmony_ci * GP scheduling algorithm (by Connor Abbott <cwabbott0@gmail.com>) 31bf215546Sopenharmony_ci * 32bf215546Sopenharmony_ci * The GP pipeline has three main stages: 33bf215546Sopenharmony_ci * 34bf215546Sopenharmony_ci * -------------------------------------------------------- 35bf215546Sopenharmony_ci * | | 36bf215546Sopenharmony_ci * | Register/Attr/Temp Fetch | 37bf215546Sopenharmony_ci * | | 38bf215546Sopenharmony_ci * -------------------------------------------------------- 39bf215546Sopenharmony_ci * | | | | | | | 40bf215546Sopenharmony_ci * | Mul0 | Mul1 | Add0 | Add1 | Cplx | Pass | 41bf215546Sopenharmony_ci * | | | | | | | 42bf215546Sopenharmony_ci * -------------------------------------------------------- 43bf215546Sopenharmony_ci * | | | | 44bf215546Sopenharmony_ci * | Complex1 | Temp/Register/Varying | Pass | 45bf215546Sopenharmony_ci * | Stage 2 | Store | Stage 2 | 46bf215546Sopenharmony_ci * | | | | 47bf215546Sopenharmony_ci * -------------------------------------------------------- 48bf215546Sopenharmony_ci * 49bf215546Sopenharmony_ci * Because of this setup, storing a register has a latency of three cycles. 50bf215546Sopenharmony_ci * Also, the register file is organized into 4-component vectors, and the 51bf215546Sopenharmony_ci * load stage can only load two vectors at a time. Aside from these highly 52bf215546Sopenharmony_ci * constrained register load/store units, there is an explicit bypass 53bf215546Sopenharmony_ci * network, where each unit (mul0/mul1/etc.) can access the results of the 54bf215546Sopenharmony_ci * any unit from the previous two cycles directly, except for the complex 55bf215546Sopenharmony_ci * unit whose result can only be accessed for one cycle (since it's expected 56bf215546Sopenharmony_ci * to be used directly by the complex2 instruction in the following cycle). 57bf215546Sopenharmony_ci * 58bf215546Sopenharmony_ci * Because of the very restricted register file, and because only rarely are 59bf215546Sopenharmony_ci * all the units in use at the same time, it can be very beneficial to use 60bf215546Sopenharmony_ci * the unused units to "thread" a value from source to destination by using 61bf215546Sopenharmony_ci * moves in the otherwise-unused units, without involving the register file 62bf215546Sopenharmony_ci * at all. It's very difficult to fully exploit this with a traditional 63bf215546Sopenharmony_ci * scheduler, so we need to do something a little un-traditional. The 512 64bf215546Sopenharmony_ci * instruction limit means that for more complex shaders, we need to do as 65bf215546Sopenharmony_ci * well as possible or else the app won't even work. 66bf215546Sopenharmony_ci * 67bf215546Sopenharmony_ci * The scheduler works by considering the bypass network as a kind of 68bf215546Sopenharmony_ci * register file. It's a quite unusual register file, since registers have to 69bf215546Sopenharmony_ci * be assigned "on the fly" as we schedule operations, but with some care, we 70bf215546Sopenharmony_ci * can use something conceptually similar to a linear-scan allocator to 71bf215546Sopenharmony_ci * successfully schedule nodes to instructions without running into 72bf215546Sopenharmony_ci * conflicts. 73bf215546Sopenharmony_ci * 74bf215546Sopenharmony_ci * Values in the IR are separated into normal values, or "value registers", 75bf215546Sopenharmony_ci * which is what normal nodes like add, mul, etc. produce, and which only 76bf215546Sopenharmony_ci * live inside one basic block, and registers, which can span multiple basic 77bf215546Sopenharmony_ci * blocks but have to be accessed via special load_reg/store_reg nodes. RA 78bf215546Sopenharmony_ci * assigns physical registers to both value registers and normal registers, 79bf215546Sopenharmony_ci * treating load_reg/store_reg as a move instruction, but these are only used 80bf215546Sopenharmony_ci * directly for normal registers -- the physreg assigned to a value register 81bf215546Sopenharmony_ci * is "fake," and is only used inside the scheduler. Before scheduling we 82bf215546Sopenharmony_ci * insert read-after-write dependencies, even for value registers, as if 83bf215546Sopenharmony_ci * we're going to use those, but then we throw them away. For example, if we 84bf215546Sopenharmony_ci * had something like: 85bf215546Sopenharmony_ci * 86bf215546Sopenharmony_ci * (*)r2 = add (*)r1, (*)r2 87bf215546Sopenharmony_ci * (*)r1 = load_reg r0 88bf215546Sopenharmony_ci * 89bf215546Sopenharmony_ci * we'd insert a write-after-read dependency between the add and load_reg, 90bf215546Sopenharmony_ci * even though the starred registers aren't actually used by the scheduler 91bf215546Sopenharmony_ci * after this step. This step is crucial since it guarantees that during any 92bf215546Sopenharmony_ci * point in the schedule, the number of live registers + live value registers 93bf215546Sopenharmony_ci * will never exceed the capacity of the register file and the bypass network 94bf215546Sopenharmony_ci * combined. This is because each live register/value register will have a 95bf215546Sopenharmony_ci * different fake number, thanks to the fake dependencies inserted before 96bf215546Sopenharmony_ci * scheduling. This allows us to not have to worry about spilling to 97bf215546Sopenharmony_ci * temporaries, which is only done ahead of time. 98bf215546Sopenharmony_ci * 99bf215546Sopenharmony_ci * The scheduler is a bottom-up scheduler. It keeps track of each live value 100bf215546Sopenharmony_ci * register, and decides on-the-fly which value registers to keep in the 101bf215546Sopenharmony_ci * bypass network and which to "spill" to registers. Of particular importance 102bf215546Sopenharmony_ci * is the "ready list," which consists of "input nodes" (nodes that produce a 103bf215546Sopenharmony_ci * value that can be consumed via the bypass network), both "partially ready" 104bf215546Sopenharmony_ci * (only some of the uses have been scheduled) and "fully ready" (all uses 105bf215546Sopenharmony_ci * have been scheduled), as well as other non-input nodes like register 106bf215546Sopenharmony_ci * stores. Each input node on the ready list represents a live value register 107bf215546Sopenharmony_ci * before the current instruction. There must be at most 11 such input nodes 108bf215546Sopenharmony_ci * at all times, since there are only 11 slots in the next two instructions 109bf215546Sopenharmony_ci * which can reach the current instruction. 110bf215546Sopenharmony_ci * 111bf215546Sopenharmony_ci * An input node is a "max node" if it has a use two cycles ago, which must be 112bf215546Sopenharmony_ci * connected to a definition this cycle. Otherwise it may be a "next max node" 113bf215546Sopenharmony_ci * if it will be a max node on the next instruction (i.e. it has a use at most 114bf215546Sopenharmony_ci * one cycle ago), or it may be neither if all of its uses are this cycle. As 115bf215546Sopenharmony_ci * we keep adding instructions to the front, input nodes graduate from 116bf215546Sopenharmony_ci * neither, to next max, to max, unless we decide to insert a move to keep it 117bf215546Sopenharmony_ci * alive longer, at which point any uses after the current instruction are 118bf215546Sopenharmony_ci * rewritten to be uses of the move so that the original node returns to 119bf215546Sopenharmony_ci * neither. The scheduler decides which nodes to try freely, but we have to 120bf215546Sopenharmony_ci * reserve slots for two different reasons: (1) out of the 5 non-complex 121bf215546Sopenharmony_ci * slots, we reserve a slot for each max node, so that we can connect a 122bf215546Sopenharmony_ci * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a 123bf215546Sopenharmony_ci * slot for every next-max node above 5, so that for the next instruction 124bf215546Sopenharmony_ci * there are no more than 5 max nodes. When a max or next-max node gets 125bf215546Sopenharmony_ci * scheduled, the corresponding reservation is reduced by one. At the end, we 126bf215546Sopenharmony_ci * insert moves for every slot that was reserved. The reservation is actually 127bf215546Sopenharmony_ci * managed by nir_instr, and all we have to do is tell it how many to reserve 128bf215546Sopenharmony_ci * at the beginning and then tell it which nodes are max/next-max nodes. When 129bf215546Sopenharmony_ci * we start scheduling an instruction, there will be at most 5 max nodes 130bf215546Sopenharmony_ci * thanks to the previous instruction's next-max reservation/move insertion. 131bf215546Sopenharmony_ci * Since there are at most 11 total input nodes, if there are N max nodes, 132bf215546Sopenharmony_ci * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 = 133bf215546Sopenharmony_ci * 6 - N slots need to be reserved for next-max nodes, and so at most 134bf215546Sopenharmony_ci * 6 - N + N = 6 slots need to be reserved in total, exactly the total number 135bf215546Sopenharmony_ci * of slots. So, thanks to the total input node restriction, we will never 136bf215546Sopenharmony_ci * need to reserve too many slots. 137bf215546Sopenharmony_ci * 138bf215546Sopenharmony_ci * It sometimes happens that scheduling a given node will violate this total 139bf215546Sopenharmony_ci * input node restriction, or that a reservation will mean that we can't 140bf215546Sopenharmony_ci * schedule it. We first schedule a node "speculatively" to see if this is a 141bf215546Sopenharmony_ci * problem. If some of the node's sources are loads, then we can schedule 142bf215546Sopenharmony_ci * the node and its dependent loads in one swoop to avoid going over the 143bf215546Sopenharmony_ci * pressure limit. If that fails, we can try to spill a ready or 144bf215546Sopenharmony_ci * partially-ready input node to a register by rewriting all of its uses to 145bf215546Sopenharmony_ci * refer to a register load. This removes it from the list of ready and 146bf215546Sopenharmony_ci * partially ready input nodes as all of its uses are now unscheduled. If 147bf215546Sopenharmony_ci * successful, we can then proceed with scheduling the original node. All of 148bf215546Sopenharmony_ci * this happens "speculatively," meaning that afterwards the node is removed 149bf215546Sopenharmony_ci * and the entire state of the scheduler is reverted to before it was tried, to 150bf215546Sopenharmony_ci * ensure that we never get into an invalid state and run out of spots for 151bf215546Sopenharmony_ci * moves. In try_nodes(), we try to schedule each node speculatively on the 152bf215546Sopenharmony_ci * ready list, keeping only the nodes that could be successfully scheduled, so 153bf215546Sopenharmony_ci * that when we finally decide which node to actually schedule, we know it 154bf215546Sopenharmony_ci * will succeed. This is how we decide on the fly which values go in 155bf215546Sopenharmony_ci * registers and which go in the bypass network. Note that "unspilling" a 156bf215546Sopenharmony_ci * value is simply a matter of scheduling the store_reg instruction created 157bf215546Sopenharmony_ci * when we spill. 158bf215546Sopenharmony_ci * 159bf215546Sopenharmony_ci * The careful accounting of live value registers, reservations for moves, and 160bf215546Sopenharmony_ci * speculative scheduling guarantee that we never run into a failure case 161bf215546Sopenharmony_ci * while scheduling. However, we need to make sure that this scheduler will 162bf215546Sopenharmony_ci * not get stuck in an infinite loop, i.e. that we'll always make forward 163bf215546Sopenharmony_ci * progress by eventually scheduling a non-move node. If we run out of value 164bf215546Sopenharmony_ci * registers, then we may have to spill a node to a register. If we 165bf215546Sopenharmony_ci * were to schedule one of the fully-ready nodes, then we'd have 11 + N live 166bf215546Sopenharmony_ci * value registers before the current instruction. But since there are at most 167bf215546Sopenharmony_ci * 64+11 live registers and register values total thanks to the fake 168bf215546Sopenharmony_ci * dependencies we inserted before scheduling, there are at most 64 - N live 169bf215546Sopenharmony_ci * physical registers, and therefore there are at least N registers available 170bf215546Sopenharmony_ci * for spilling. Not all these registers will be available immediately, since 171bf215546Sopenharmony_ci * in order to spill a node to a given register we have to ensure that there 172bf215546Sopenharmony_ci * are slots available to rewrite every use to a load instruction, and that 173bf215546Sopenharmony_ci * may not be the case. There may also be intervening writes which prevent 174bf215546Sopenharmony_ci * some registers from being used. However, these are all temporary problems, 175bf215546Sopenharmony_ci * since as we create each instruction, we create additional register load 176bf215546Sopenharmony_ci * slots that can be freely used for spilling, and we create more move nodes 177bf215546Sopenharmony_ci * which means that the uses of the nodes we're trying to spill keep moving 178bf215546Sopenharmony_ci * forward. This means that eventually, these problems will go away, at which 179bf215546Sopenharmony_ci * point we'll be able to spill a node successfully, so eventually we'll be 180bf215546Sopenharmony_ci * able to schedule the first node on the ready list. 181bf215546Sopenharmony_ci */ 182bf215546Sopenharmony_ci 183bf215546Sopenharmony_citypedef struct { 184bf215546Sopenharmony_ci /* This is the list of ready and partially-ready nodes. A partially-ready 185bf215546Sopenharmony_ci * node must have at least one input dependency already scheduled. 186bf215546Sopenharmony_ci */ 187bf215546Sopenharmony_ci struct list_head ready_list; 188bf215546Sopenharmony_ci 189bf215546Sopenharmony_ci /* The number of ready or partially-ready nodes with at least one input 190bf215546Sopenharmony_ci * dependency already scheduled. In other words, the number of live value 191bf215546Sopenharmony_ci * registers. This must be at most 11. 192bf215546Sopenharmony_ci */ 193bf215546Sopenharmony_ci int ready_list_slots; 194bf215546Sopenharmony_ci 195bf215546Sopenharmony_ci /* The physical registers live into the current instruction. */ 196bf215546Sopenharmony_ci uint64_t live_physregs; 197bf215546Sopenharmony_ci 198bf215546Sopenharmony_ci /* The current instruction. */ 199bf215546Sopenharmony_ci gpir_instr *instr; 200bf215546Sopenharmony_ci 201bf215546Sopenharmony_ci /* The current basic block. */ 202bf215546Sopenharmony_ci gpir_block *block; 203bf215546Sopenharmony_ci 204bf215546Sopenharmony_ci /* True if at least one node failed to schedule due to lack of available 205bf215546Sopenharmony_ci * value registers. 206bf215546Sopenharmony_ci */ 207bf215546Sopenharmony_ci bool try_spill_all; 208bf215546Sopenharmony_ci 209bf215546Sopenharmony_ci /* The number of max nodes needed to spill to successfully schedule the 210bf215546Sopenharmony_ci * instruction. 211bf215546Sopenharmony_ci */ 212bf215546Sopenharmony_ci int max_node_spill_needed; 213bf215546Sopenharmony_ci 214bf215546Sopenharmony_ci /* The number of max and next-max nodes needed to spill to successfully 215bf215546Sopenharmony_ci * schedule the instruction. 216bf215546Sopenharmony_ci */ 217bf215546Sopenharmony_ci int total_spill_needed; 218bf215546Sopenharmony_ci 219bf215546Sopenharmony_ci /* For each physical register, a linked list of loads associated with it in 220bf215546Sopenharmony_ci * this block. When we spill a value to a given register, and there are 221bf215546Sopenharmony_ci * existing loads associated with it that haven't been scheduled yet, we 222bf215546Sopenharmony_ci * have to make sure that the corresponding unspill happens after the last 223bf215546Sopenharmony_ci * original use has happened, i.e. is scheduled before. 224bf215546Sopenharmony_ci */ 225bf215546Sopenharmony_ci struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM]; 226bf215546Sopenharmony_ci} sched_ctx; 227bf215546Sopenharmony_ci 228bf215546Sopenharmony_cistatic int gpir_min_dist_alu(gpir_dep *dep) 229bf215546Sopenharmony_ci{ 230bf215546Sopenharmony_ci switch (dep->pred->op) { 231bf215546Sopenharmony_ci case gpir_op_load_uniform: 232bf215546Sopenharmony_ci case gpir_op_load_temp: 233bf215546Sopenharmony_ci case gpir_op_load_reg: 234bf215546Sopenharmony_ci case gpir_op_load_attribute: 235bf215546Sopenharmony_ci return 0; 236bf215546Sopenharmony_ci 237bf215546Sopenharmony_ci case gpir_op_complex1: 238bf215546Sopenharmony_ci return 2; 239bf215546Sopenharmony_ci 240bf215546Sopenharmony_ci default: 241bf215546Sopenharmony_ci return 1; 242bf215546Sopenharmony_ci } 243bf215546Sopenharmony_ci} 244bf215546Sopenharmony_ci 245bf215546Sopenharmony_cistatic int gpir_get_min_dist(gpir_dep *dep) 246bf215546Sopenharmony_ci{ 247bf215546Sopenharmony_ci switch (dep->type) { 248bf215546Sopenharmony_ci case GPIR_DEP_INPUT: 249bf215546Sopenharmony_ci switch (dep->succ->op) { 250bf215546Sopenharmony_ci case gpir_op_store_temp: 251bf215546Sopenharmony_ci case gpir_op_store_reg: 252bf215546Sopenharmony_ci case gpir_op_store_varying: 253bf215546Sopenharmony_ci /* Stores must use an alu node as input. Also, complex1 takes two 254bf215546Sopenharmony_ci * cycles, which means that its result cannot be stored to a register 255bf215546Sopenharmony_ci * as part of the normal path, and therefore it must also have a move 256bf215546Sopenharmony_ci * inserted. 257bf215546Sopenharmony_ci */ 258bf215546Sopenharmony_ci if (dep->pred->type == gpir_node_type_load || 259bf215546Sopenharmony_ci dep->pred->op == gpir_op_complex1) 260bf215546Sopenharmony_ci return INT_MAX >> 2; 261bf215546Sopenharmony_ci else 262bf215546Sopenharmony_ci return 0; 263bf215546Sopenharmony_ci 264bf215546Sopenharmony_ci default: 265bf215546Sopenharmony_ci return gpir_min_dist_alu(dep); 266bf215546Sopenharmony_ci } 267bf215546Sopenharmony_ci 268bf215546Sopenharmony_ci case GPIR_DEP_OFFSET: 269bf215546Sopenharmony_ci assert(dep->succ->op == gpir_op_store_temp); 270bf215546Sopenharmony_ci return gpir_min_dist_alu(dep); 271bf215546Sopenharmony_ci 272bf215546Sopenharmony_ci case GPIR_DEP_READ_AFTER_WRITE: 273bf215546Sopenharmony_ci if (dep->succ->op == gpir_op_load_temp && 274bf215546Sopenharmony_ci dep->pred->op == gpir_op_store_temp) { 275bf215546Sopenharmony_ci return 4; 276bf215546Sopenharmony_ci } else if (dep->succ->op == gpir_op_load_reg && 277bf215546Sopenharmony_ci dep->pred->op == gpir_op_store_reg) { 278bf215546Sopenharmony_ci return 3; 279bf215546Sopenharmony_ci } else if ((dep->pred->op == gpir_op_store_temp_load_off0 || 280bf215546Sopenharmony_ci dep->pred->op == gpir_op_store_temp_load_off1 || 281bf215546Sopenharmony_ci dep->pred->op == gpir_op_store_temp_load_off2) && 282bf215546Sopenharmony_ci dep->succ->op == gpir_op_load_uniform) { 283bf215546Sopenharmony_ci return 4; 284bf215546Sopenharmony_ci } else { 285bf215546Sopenharmony_ci /* Fake dependency */ 286bf215546Sopenharmony_ci return 0; 287bf215546Sopenharmony_ci } 288bf215546Sopenharmony_ci 289bf215546Sopenharmony_ci case GPIR_DEP_WRITE_AFTER_READ: 290bf215546Sopenharmony_ci return 0; 291bf215546Sopenharmony_ci } 292bf215546Sopenharmony_ci 293bf215546Sopenharmony_ci return 0; 294bf215546Sopenharmony_ci} 295bf215546Sopenharmony_ci 296bf215546Sopenharmony_cistatic int gpir_max_dist_alu(gpir_dep *dep) 297bf215546Sopenharmony_ci{ 298bf215546Sopenharmony_ci switch (dep->pred->op) { 299bf215546Sopenharmony_ci case gpir_op_load_uniform: 300bf215546Sopenharmony_ci case gpir_op_load_temp: 301bf215546Sopenharmony_ci return 0; 302bf215546Sopenharmony_ci case gpir_op_load_attribute: 303bf215546Sopenharmony_ci return 1; 304bf215546Sopenharmony_ci case gpir_op_load_reg: 305bf215546Sopenharmony_ci if (dep->pred->sched.pos < GPIR_INSTR_SLOT_REG0_LOAD0 || 306bf215546Sopenharmony_ci dep->pred->sched.pos > GPIR_INSTR_SLOT_REG0_LOAD3) 307bf215546Sopenharmony_ci return 0; 308bf215546Sopenharmony_ci else 309bf215546Sopenharmony_ci return 1; 310bf215546Sopenharmony_ci case gpir_op_exp2_impl: 311bf215546Sopenharmony_ci case gpir_op_log2_impl: 312bf215546Sopenharmony_ci case gpir_op_rcp_impl: 313bf215546Sopenharmony_ci case gpir_op_rsqrt_impl: 314bf215546Sopenharmony_ci case gpir_op_store_temp_load_off0: 315bf215546Sopenharmony_ci case gpir_op_store_temp_load_off1: 316bf215546Sopenharmony_ci case gpir_op_store_temp_load_off2: 317bf215546Sopenharmony_ci return 1; 318bf215546Sopenharmony_ci case gpir_op_mov: 319bf215546Sopenharmony_ci if (dep->pred->sched.pos == GPIR_INSTR_SLOT_COMPLEX) 320bf215546Sopenharmony_ci return 1; 321bf215546Sopenharmony_ci else 322bf215546Sopenharmony_ci return 2; 323bf215546Sopenharmony_ci default: 324bf215546Sopenharmony_ci return 2; 325bf215546Sopenharmony_ci } 326bf215546Sopenharmony_ci} 327bf215546Sopenharmony_ci 328bf215546Sopenharmony_cistatic int gpir_get_max_dist(gpir_dep *dep) 329bf215546Sopenharmony_ci{ 330bf215546Sopenharmony_ci switch (dep->type) { 331bf215546Sopenharmony_ci case GPIR_DEP_INPUT: 332bf215546Sopenharmony_ci switch (dep->succ->op) { 333bf215546Sopenharmony_ci case gpir_op_store_temp: 334bf215546Sopenharmony_ci case gpir_op_store_reg: 335bf215546Sopenharmony_ci case gpir_op_store_varying: 336bf215546Sopenharmony_ci return 0; 337bf215546Sopenharmony_ci 338bf215546Sopenharmony_ci default: 339bf215546Sopenharmony_ci return gpir_max_dist_alu(dep); 340bf215546Sopenharmony_ci } 341bf215546Sopenharmony_ci 342bf215546Sopenharmony_ci case GPIR_DEP_OFFSET: 343bf215546Sopenharmony_ci assert(dep->succ->op == gpir_op_store_temp); 344bf215546Sopenharmony_ci return gpir_max_dist_alu(dep); 345bf215546Sopenharmony_ci 346bf215546Sopenharmony_ci default: 347bf215546Sopenharmony_ci return INT_MAX >> 2; /* Don't want to overflow... */ 348bf215546Sopenharmony_ci } 349bf215546Sopenharmony_ci} 350bf215546Sopenharmony_ci 351bf215546Sopenharmony_cistatic void schedule_update_distance(gpir_node *node) 352bf215546Sopenharmony_ci{ 353bf215546Sopenharmony_ci if (gpir_node_is_leaf(node)) { 354bf215546Sopenharmony_ci node->sched.dist = 0; 355bf215546Sopenharmony_ci return; 356bf215546Sopenharmony_ci } 357bf215546Sopenharmony_ci 358bf215546Sopenharmony_ci gpir_node_foreach_pred(node, dep) { 359bf215546Sopenharmony_ci gpir_node *pred = dep->pred; 360bf215546Sopenharmony_ci 361bf215546Sopenharmony_ci if (pred->sched.dist < 0) 362bf215546Sopenharmony_ci schedule_update_distance(pred); 363bf215546Sopenharmony_ci 364bf215546Sopenharmony_ci int dist = pred->sched.dist + gpir_min_dist_alu(dep); 365bf215546Sopenharmony_ci if (node->sched.dist < dist) 366bf215546Sopenharmony_ci node->sched.dist = dist; 367bf215546Sopenharmony_ci } 368bf215546Sopenharmony_ci} 369bf215546Sopenharmony_ci 370bf215546Sopenharmony_cistatic bool gpir_is_input_node(gpir_node *node) 371bf215546Sopenharmony_ci{ 372bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 373bf215546Sopenharmony_ci if (dep->type == GPIR_DEP_INPUT) 374bf215546Sopenharmony_ci return true; 375bf215546Sopenharmony_ci } 376bf215546Sopenharmony_ci return false; 377bf215546Sopenharmony_ci} 378bf215546Sopenharmony_ci 379bf215546Sopenharmony_ci 380bf215546Sopenharmony_ci/* Get the number of slots required for a node on the ready list. 381bf215546Sopenharmony_ci */ 382bf215546Sopenharmony_cistatic int gpir_get_slots_required(gpir_node *node) 383bf215546Sopenharmony_ci{ 384bf215546Sopenharmony_ci if (!gpir_is_input_node(node)) 385bf215546Sopenharmony_ci return 0; 386bf215546Sopenharmony_ci 387bf215546Sopenharmony_ci /* Note that we assume every node only consumes one slot, even dual-slot 388bf215546Sopenharmony_ci * instructions. While dual-slot instructions may consume more than one 389bf215546Sopenharmony_ci * slot, we can always safely insert a move if it turns out that there 390bf215546Sopenharmony_ci * isn't enough space for them. There's the risk that we get stuck in an 391bf215546Sopenharmony_ci * infinite loop if all the fully ready nodes are dual-slot nodes, but we 392bf215546Sopenharmony_ci * rely on spilling to registers to save us here. 393bf215546Sopenharmony_ci */ 394bf215546Sopenharmony_ci return 1; 395bf215546Sopenharmony_ci} 396bf215546Sopenharmony_ci 397bf215546Sopenharmony_cistatic void ASSERTED verify_ready_list(sched_ctx *ctx) 398bf215546Sopenharmony_ci{ 399bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { 400bf215546Sopenharmony_ci if (!gpir_is_input_node(node)) { 401bf215546Sopenharmony_ci assert(node->sched.ready); 402bf215546Sopenharmony_ci } 403bf215546Sopenharmony_ci 404bf215546Sopenharmony_ci if (node->sched.ready) { 405bf215546Sopenharmony_ci /* Every successor must have been scheduled */ 406bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 407bf215546Sopenharmony_ci assert(dep->succ->sched.instr); 408bf215546Sopenharmony_ci } 409bf215546Sopenharmony_ci } else { 410bf215546Sopenharmony_ci /* There must be at least one successor that's not scheduled. */ 411bf215546Sopenharmony_ci bool unscheduled = false; 412bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 413bf215546Sopenharmony_ci unscheduled |= !(dep->succ->sched.instr); 414bf215546Sopenharmony_ci } 415bf215546Sopenharmony_ci 416bf215546Sopenharmony_ci assert(unscheduled); 417bf215546Sopenharmony_ci } 418bf215546Sopenharmony_ci } 419bf215546Sopenharmony_ci} 420bf215546Sopenharmony_ci 421bf215546Sopenharmony_cistatic void schedule_insert_ready_list(sched_ctx *ctx, 422bf215546Sopenharmony_ci gpir_node *insert_node) 423bf215546Sopenharmony_ci{ 424bf215546Sopenharmony_ci /* if this node is fully ready or partially ready 425bf215546Sopenharmony_ci * fully ready: all successors have been scheduled 426bf215546Sopenharmony_ci * partially ready: part of input successors have been scheduled 427bf215546Sopenharmony_ci * 428bf215546Sopenharmony_ci * either fully ready or partially ready node need be inserted to 429bf215546Sopenharmony_ci * the ready list, but we only schedule a move node for partially 430bf215546Sopenharmony_ci * ready node. 431bf215546Sopenharmony_ci */ 432bf215546Sopenharmony_ci bool ready = true, insert = false; 433bf215546Sopenharmony_ci gpir_node_foreach_succ(insert_node, dep) { 434bf215546Sopenharmony_ci gpir_node *succ = dep->succ; 435bf215546Sopenharmony_ci if (succ->sched.instr) { 436bf215546Sopenharmony_ci if (dep->type == GPIR_DEP_INPUT) 437bf215546Sopenharmony_ci insert = true; 438bf215546Sopenharmony_ci } 439bf215546Sopenharmony_ci else 440bf215546Sopenharmony_ci ready = false; 441bf215546Sopenharmony_ci } 442bf215546Sopenharmony_ci 443bf215546Sopenharmony_ci insert_node->sched.ready = ready; 444bf215546Sopenharmony_ci /* for root node */ 445bf215546Sopenharmony_ci insert |= ready; 446bf215546Sopenharmony_ci 447bf215546Sopenharmony_ci if (!insert || insert_node->sched.inserted) 448bf215546Sopenharmony_ci return; 449bf215546Sopenharmony_ci 450bf215546Sopenharmony_ci struct list_head *insert_pos = &ctx->ready_list; 451bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { 452bf215546Sopenharmony_ci if ((insert_node->sched.dist > node->sched.dist || 453bf215546Sopenharmony_ci gpir_op_infos[insert_node->op].schedule_first) && 454bf215546Sopenharmony_ci !gpir_op_infos[node->op].schedule_first) { 455bf215546Sopenharmony_ci insert_pos = &node->list; 456bf215546Sopenharmony_ci break; 457bf215546Sopenharmony_ci } 458bf215546Sopenharmony_ci } 459bf215546Sopenharmony_ci 460bf215546Sopenharmony_ci list_addtail(&insert_node->list, insert_pos); 461bf215546Sopenharmony_ci insert_node->sched.inserted = true; 462bf215546Sopenharmony_ci ctx->ready_list_slots += gpir_get_slots_required(insert_node); 463bf215546Sopenharmony_ci} 464bf215546Sopenharmony_ci 465bf215546Sopenharmony_cistatic int gpir_get_max_start(gpir_node *node) 466bf215546Sopenharmony_ci{ 467bf215546Sopenharmony_ci int max_start = 0; 468bf215546Sopenharmony_ci 469bf215546Sopenharmony_ci /* find the max start instr constrainted by all successors */ 470bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 471bf215546Sopenharmony_ci gpir_node *succ = dep->succ; 472bf215546Sopenharmony_ci if (!succ->sched.instr) 473bf215546Sopenharmony_ci continue; 474bf215546Sopenharmony_ci 475bf215546Sopenharmony_ci int start = succ->sched.instr->index + gpir_get_min_dist(dep); 476bf215546Sopenharmony_ci if (start > max_start) 477bf215546Sopenharmony_ci max_start = start; 478bf215546Sopenharmony_ci } 479bf215546Sopenharmony_ci 480bf215546Sopenharmony_ci return max_start; 481bf215546Sopenharmony_ci} 482bf215546Sopenharmony_ci 483bf215546Sopenharmony_cistatic int gpir_get_min_end(gpir_node *node) 484bf215546Sopenharmony_ci{ 485bf215546Sopenharmony_ci int min_end = INT_MAX; 486bf215546Sopenharmony_ci 487bf215546Sopenharmony_ci /* find the min end instr constrainted by all successors */ 488bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 489bf215546Sopenharmony_ci gpir_node *succ = dep->succ; 490bf215546Sopenharmony_ci if (!succ->sched.instr) 491bf215546Sopenharmony_ci continue; 492bf215546Sopenharmony_ci 493bf215546Sopenharmony_ci int end = succ->sched.instr->index + gpir_get_max_dist(dep); 494bf215546Sopenharmony_ci if (end < min_end) 495bf215546Sopenharmony_ci min_end = end; 496bf215546Sopenharmony_ci } 497bf215546Sopenharmony_ci 498bf215546Sopenharmony_ci return min_end; 499bf215546Sopenharmony_ci} 500bf215546Sopenharmony_ci 501bf215546Sopenharmony_cistatic gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node) 502bf215546Sopenharmony_ci{ 503bf215546Sopenharmony_ci gpir_load_node *load = gpir_node_to_load(node); 504bf215546Sopenharmony_ci 505bf215546Sopenharmony_ci for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) { 506bf215546Sopenharmony_ci if (!instr->slots[i]) 507bf215546Sopenharmony_ci continue; 508bf215546Sopenharmony_ci 509bf215546Sopenharmony_ci gpir_load_node *iload = gpir_node_to_load(instr->slots[i]); 510bf215546Sopenharmony_ci if (load->node.op == iload->node.op && 511bf215546Sopenharmony_ci load->index == iload->index && 512bf215546Sopenharmony_ci load->component == iload->component) 513bf215546Sopenharmony_ci return &iload->node; 514bf215546Sopenharmony_ci } 515bf215546Sopenharmony_ci return NULL; 516bf215546Sopenharmony_ci} 517bf215546Sopenharmony_ci 518bf215546Sopenharmony_ci/* Simply place the node into the given instruction without trying to deal 519bf215546Sopenharmony_ci * with liveness or the ready list. This will only fail if the instruction 520bf215546Sopenharmony_ci * cannot be placed due to a lack of available slots. In addition to normal 521bf215546Sopenharmony_ci * node placement, this is also used for placing loads when spilling to 522bf215546Sopenharmony_ci * registers. 523bf215546Sopenharmony_ci */ 524bf215546Sopenharmony_cistatic bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node) 525bf215546Sopenharmony_ci{ 526bf215546Sopenharmony_ci if (node->type == gpir_node_type_load) { 527bf215546Sopenharmony_ci gpir_node *load = gpir_sched_instr_has_load(instr, node); 528bf215546Sopenharmony_ci if (load) { 529bf215546Sopenharmony_ci /* This node may have a store as a successor, in which case we have to 530bf215546Sopenharmony_ci * fail it exactly like below in order to later create a move node in 531bf215546Sopenharmony_ci * between. 532bf215546Sopenharmony_ci */ 533bf215546Sopenharmony_ci if (instr->index < gpir_get_max_start(node)) 534bf215546Sopenharmony_ci return false; 535bf215546Sopenharmony_ci 536bf215546Sopenharmony_ci gpir_debug("same load %d in instr %d for node %d\n", 537bf215546Sopenharmony_ci load->index, instr->index, node->index); 538bf215546Sopenharmony_ci 539bf215546Sopenharmony_ci /* not really merge two node, just fake scheduled same place */ 540bf215546Sopenharmony_ci node->sched.instr = load->sched.instr; 541bf215546Sopenharmony_ci node->sched.pos = load->sched.pos; 542bf215546Sopenharmony_ci return true; 543bf215546Sopenharmony_ci } 544bf215546Sopenharmony_ci } 545bf215546Sopenharmony_ci 546bf215546Sopenharmony_ci if (node->op == gpir_op_store_reg) { 547bf215546Sopenharmony_ci /* This register may be loaded in the next basic block, in which case 548bf215546Sopenharmony_ci * there still needs to be a 2 instruction gap. We do what the blob 549bf215546Sopenharmony_ci * seems to do and simply disable stores in the last two instructions of 550bf215546Sopenharmony_ci * the basic block. 551bf215546Sopenharmony_ci * 552bf215546Sopenharmony_ci * TODO: We may be able to do better than this, but we have to check 553bf215546Sopenharmony_ci * first if storing a register works across branches. 554bf215546Sopenharmony_ci */ 555bf215546Sopenharmony_ci if (instr->index < 2) 556bf215546Sopenharmony_ci return false; 557bf215546Sopenharmony_ci } 558bf215546Sopenharmony_ci 559bf215546Sopenharmony_ci node->sched.instr = instr; 560bf215546Sopenharmony_ci 561bf215546Sopenharmony_ci int max_node_spill_needed = INT_MAX; 562bf215546Sopenharmony_ci int total_spill_needed = INT_MAX; 563bf215546Sopenharmony_ci int *slots = gpir_op_infos[node->op].slots; 564bf215546Sopenharmony_ci for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) { 565bf215546Sopenharmony_ci node->sched.pos = slots[i]; 566bf215546Sopenharmony_ci if (instr->index >= gpir_get_max_start(node) && 567bf215546Sopenharmony_ci instr->index <= gpir_get_min_end(node) && 568bf215546Sopenharmony_ci gpir_instr_try_insert_node(instr, node)) 569bf215546Sopenharmony_ci return true; 570bf215546Sopenharmony_ci if (ctx->instr->non_cplx_slot_difference || 571bf215546Sopenharmony_ci ctx->instr->slot_difference) { 572bf215546Sopenharmony_ci /* If one of these fields is non-zero, then we could insert the node 573bf215546Sopenharmony_ci * here after spilling. To get an accurate count of how many nodes we 574bf215546Sopenharmony_ci * need to spill, we need to choose one of the positions where there 575bf215546Sopenharmony_ci * were nonzero slot differences, preferably one with the smallest 576bf215546Sopenharmony_ci * difference (so we don't have to spill as much). 577bf215546Sopenharmony_ci */ 578bf215546Sopenharmony_ci if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed || 579bf215546Sopenharmony_ci ctx->instr->slot_difference < total_spill_needed) { 580bf215546Sopenharmony_ci max_node_spill_needed = ctx->instr->non_cplx_slot_difference; 581bf215546Sopenharmony_ci total_spill_needed = ctx->instr->slot_difference; 582bf215546Sopenharmony_ci } 583bf215546Sopenharmony_ci } 584bf215546Sopenharmony_ci } 585bf215546Sopenharmony_ci 586bf215546Sopenharmony_ci if (max_node_spill_needed != INT_MAX) { 587bf215546Sopenharmony_ci /* Indicate how many spill nodes are needed. */ 588bf215546Sopenharmony_ci ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed, 589bf215546Sopenharmony_ci max_node_spill_needed); 590bf215546Sopenharmony_ci ctx->total_spill_needed = MAX2(ctx->total_spill_needed, 591bf215546Sopenharmony_ci total_spill_needed); 592bf215546Sopenharmony_ci } 593bf215546Sopenharmony_ci node->sched.instr = NULL; 594bf215546Sopenharmony_ci node->sched.pos = -1; 595bf215546Sopenharmony_ci return false; 596bf215546Sopenharmony_ci} 597bf215546Sopenharmony_ci 598bf215546Sopenharmony_ci/* Try to place just the node given, updating the ready list. If "speculative" 599bf215546Sopenharmony_ci * is true, then this is part of the pre-commit phase. If false, then we have 600bf215546Sopenharmony_ci * committed to placing this node, so update liveness and ready list 601bf215546Sopenharmony_ci * information. 602bf215546Sopenharmony_ci */ 603bf215546Sopenharmony_ci 604bf215546Sopenharmony_cistatic bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node, 605bf215546Sopenharmony_ci bool speculative) 606bf215546Sopenharmony_ci{ 607bf215546Sopenharmony_ci if (!_try_place_node(ctx, ctx->instr, node)) { 608bf215546Sopenharmony_ci if (!speculative) 609bf215546Sopenharmony_ci gpir_debug("failed to place %d\n", node->index); 610bf215546Sopenharmony_ci return false; 611bf215546Sopenharmony_ci } 612bf215546Sopenharmony_ci 613bf215546Sopenharmony_ci ctx->ready_list_slots -= gpir_get_slots_required(node); 614bf215546Sopenharmony_ci 615bf215546Sopenharmony_ci if (!speculative) { 616bf215546Sopenharmony_ci gpir_debug("placed node %d\n", node->index); 617bf215546Sopenharmony_ci 618bf215546Sopenharmony_ci /* We assume here that writes are placed before reads. If this changes, 619bf215546Sopenharmony_ci * then this needs to be updated. 620bf215546Sopenharmony_ci */ 621bf215546Sopenharmony_ci if (node->op == gpir_op_store_reg) { 622bf215546Sopenharmony_ci gpir_store_node *store = gpir_node_to_store(node); 623bf215546Sopenharmony_ci ctx->live_physregs &= 624bf215546Sopenharmony_ci ~(1ull << (4 * store->index + store->component)); 625bf215546Sopenharmony_ci if (store->child->sched.physreg_store == store) 626bf215546Sopenharmony_ci store->child->sched.physreg_store = NULL; 627bf215546Sopenharmony_ci } 628bf215546Sopenharmony_ci 629bf215546Sopenharmony_ci if (node->op == gpir_op_load_reg) { 630bf215546Sopenharmony_ci gpir_load_node *load = gpir_node_to_load(node); 631bf215546Sopenharmony_ci ctx->live_physregs |= 632bf215546Sopenharmony_ci (1ull << (4 * load->index + load->component)); 633bf215546Sopenharmony_ci } 634bf215546Sopenharmony_ci 635bf215546Sopenharmony_ci list_del(&node->list); 636bf215546Sopenharmony_ci list_add(&node->list, &ctx->block->node_list); 637bf215546Sopenharmony_ci gpir_node_foreach_pred(node, dep) { 638bf215546Sopenharmony_ci gpir_node *pred = dep->pred; 639bf215546Sopenharmony_ci schedule_insert_ready_list(ctx, pred); 640bf215546Sopenharmony_ci } 641bf215546Sopenharmony_ci } else { 642bf215546Sopenharmony_ci gpir_node_foreach_pred(node, dep) { 643bf215546Sopenharmony_ci gpir_node *pred = dep->pred; 644bf215546Sopenharmony_ci if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT) 645bf215546Sopenharmony_ci ctx->ready_list_slots += gpir_get_slots_required(pred); 646bf215546Sopenharmony_ci } 647bf215546Sopenharmony_ci } 648bf215546Sopenharmony_ci 649bf215546Sopenharmony_ci return true; 650bf215546Sopenharmony_ci} 651bf215546Sopenharmony_ci 652bf215546Sopenharmony_ci/* Create a new node with "node" as the child, replace all uses of "node" with 653bf215546Sopenharmony_ci * this new node, and replace "node" with it in the ready list. 654bf215546Sopenharmony_ci */ 655bf215546Sopenharmony_cistatic gpir_node *create_replacement(sched_ctx *ctx, gpir_node *node, 656bf215546Sopenharmony_ci gpir_op op) 657bf215546Sopenharmony_ci{ 658bf215546Sopenharmony_ci 659bf215546Sopenharmony_ci gpir_alu_node *new_node = gpir_node_create(node->block, op); 660bf215546Sopenharmony_ci if (unlikely(!new_node)) 661bf215546Sopenharmony_ci return NULL; 662bf215546Sopenharmony_ci 663bf215546Sopenharmony_ci new_node->children[0] = node; 664bf215546Sopenharmony_ci new_node->num_child = 1; 665bf215546Sopenharmony_ci 666bf215546Sopenharmony_ci new_node->node.sched.instr = NULL; 667bf215546Sopenharmony_ci new_node->node.sched.pos = -1; 668bf215546Sopenharmony_ci new_node->node.sched.dist = node->sched.dist; 669bf215546Sopenharmony_ci new_node->node.sched.max_node = node->sched.max_node; 670bf215546Sopenharmony_ci new_node->node.sched.next_max_node = node->sched.next_max_node; 671bf215546Sopenharmony_ci new_node->node.sched.complex_allowed = node->sched.complex_allowed; 672bf215546Sopenharmony_ci 673bf215546Sopenharmony_ci ctx->ready_list_slots--; 674bf215546Sopenharmony_ci list_del(&node->list); 675bf215546Sopenharmony_ci node->sched.max_node = false; 676bf215546Sopenharmony_ci node->sched.next_max_node = false; 677bf215546Sopenharmony_ci node->sched.ready = false; 678bf215546Sopenharmony_ci node->sched.inserted = false; 679bf215546Sopenharmony_ci gpir_node_replace_succ(&new_node->node, node); 680bf215546Sopenharmony_ci gpir_node_add_dep(&new_node->node, node, GPIR_DEP_INPUT); 681bf215546Sopenharmony_ci schedule_insert_ready_list(ctx, &new_node->node); 682bf215546Sopenharmony_ci return &new_node->node; 683bf215546Sopenharmony_ci} 684bf215546Sopenharmony_ci 685bf215546Sopenharmony_cistatic gpir_node *create_move(sched_ctx *ctx, gpir_node *node) 686bf215546Sopenharmony_ci{ 687bf215546Sopenharmony_ci gpir_node *move = create_replacement(ctx, node, gpir_op_mov); 688bf215546Sopenharmony_ci gpir_debug("create move %d for %d\n", move->index, node->index); 689bf215546Sopenharmony_ci return move; 690bf215546Sopenharmony_ci} 691bf215546Sopenharmony_ci 692bf215546Sopenharmony_cistatic gpir_node *create_postlog2(sched_ctx *ctx, gpir_node *node) 693bf215546Sopenharmony_ci{ 694bf215546Sopenharmony_ci assert(node->op == gpir_op_complex1); 695bf215546Sopenharmony_ci gpir_node *postlog2 = create_replacement(ctx, node, gpir_op_postlog2); 696bf215546Sopenharmony_ci gpir_debug("create postlog2 %d for %d\n", postlog2->index, node->index); 697bf215546Sopenharmony_ci return postlog2; 698bf215546Sopenharmony_ci} 699bf215546Sopenharmony_ci 700bf215546Sopenharmony_ci/* Once we schedule the successor, would the predecessor be fully ready? */ 701bf215546Sopenharmony_cistatic bool pred_almost_ready(gpir_dep *dep) 702bf215546Sopenharmony_ci{ 703bf215546Sopenharmony_ci bool fully_ready = true; 704bf215546Sopenharmony_ci gpir_node_foreach_succ(dep->pred, other_dep) { 705bf215546Sopenharmony_ci gpir_node *succ = other_dep->succ; 706bf215546Sopenharmony_ci if (!succ->sched.instr && dep->succ != other_dep->succ) { 707bf215546Sopenharmony_ci fully_ready = false; 708bf215546Sopenharmony_ci break; 709bf215546Sopenharmony_ci } 710bf215546Sopenharmony_ci } 711bf215546Sopenharmony_ci 712bf215546Sopenharmony_ci return fully_ready; 713bf215546Sopenharmony_ci} 714bf215546Sopenharmony_ci 715bf215546Sopenharmony_ci/* Recursively try to schedule a node and all its dependent nodes that can fit 716bf215546Sopenharmony_ci * in the same instruction. There is a simple heuristic scoring system to try 717bf215546Sopenharmony_ci * to group together nodes that load different components of the same input, 718bf215546Sopenharmony_ci * to avoid bottlenecking for operations like matrix multiplies that are 719bf215546Sopenharmony_ci * mostly input-bound. 720bf215546Sopenharmony_ci */ 721bf215546Sopenharmony_ci 722bf215546Sopenharmony_cistatic int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative) 723bf215546Sopenharmony_ci{ 724bf215546Sopenharmony_ci if (!schedule_try_place_node(ctx, node, speculative)) 725bf215546Sopenharmony_ci return INT_MIN; 726bf215546Sopenharmony_ci 727bf215546Sopenharmony_ci int score = 0; 728bf215546Sopenharmony_ci 729bf215546Sopenharmony_ci gpir_node_foreach_pred(node, dep) { 730bf215546Sopenharmony_ci if (dep->type != GPIR_DEP_INPUT) 731bf215546Sopenharmony_ci continue; 732bf215546Sopenharmony_ci 733bf215546Sopenharmony_ci int pred_score = INT_MIN; 734bf215546Sopenharmony_ci if (pred_almost_ready(dep)) { 735bf215546Sopenharmony_ci if (dep->pred->type == gpir_node_type_load || 736bf215546Sopenharmony_ci node->type == gpir_node_type_store) { 737bf215546Sopenharmony_ci pred_score = _schedule_try_node(ctx, dep->pred, speculative); 738bf215546Sopenharmony_ci } 739bf215546Sopenharmony_ci } 740bf215546Sopenharmony_ci if (dep->pred->type == gpir_node_type_load || 741bf215546Sopenharmony_ci node->type == gpir_node_type_store) { 742bf215546Sopenharmony_ci if (pred_score == INT_MIN) { 743bf215546Sopenharmony_ci if (node->op == gpir_op_mov) { 744bf215546Sopenharmony_ci /* The only moves on the ready list are for loads that we 745bf215546Sopenharmony_ci * couldn't schedule immediately, as created below. If we 746bf215546Sopenharmony_ci * couldn't schedule the load, there's no point scheduling 747bf215546Sopenharmony_ci * the move. The normal move threading logic will ensure 748bf215546Sopenharmony_ci * that another move is created if we're about to go too far 749bf215546Sopenharmony_ci * from the uses of this move. 750bf215546Sopenharmony_ci */ 751bf215546Sopenharmony_ci assert(speculative); 752bf215546Sopenharmony_ci return INT_MIN; 753bf215546Sopenharmony_ci } else if (!speculative && dep->pred->type == gpir_node_type_load) { 754bf215546Sopenharmony_ci /* We couldn't schedule the load right away, so it will have 755bf215546Sopenharmony_ci * to happen in some earlier instruction and then be moved 756bf215546Sopenharmony_ci * into a value register and threaded to the use by "node". 757bf215546Sopenharmony_ci * We create the move right away, so that later we'll fail 758bf215546Sopenharmony_ci * to schedule it if there isn't a slot for a move 759bf215546Sopenharmony_ci * available. 760bf215546Sopenharmony_ci */ 761bf215546Sopenharmony_ci create_move(ctx, dep->pred); 762bf215546Sopenharmony_ci } 763bf215546Sopenharmony_ci /* Penalize nodes whose dependent ops we couldn't schedule. 764bf215546Sopenharmony_ci */ 765bf215546Sopenharmony_ci score--; 766bf215546Sopenharmony_ci } else { 767bf215546Sopenharmony_ci score += pred_score; 768bf215546Sopenharmony_ci continue; 769bf215546Sopenharmony_ci } 770bf215546Sopenharmony_ci } 771bf215546Sopenharmony_ci } 772bf215546Sopenharmony_ci 773bf215546Sopenharmony_ci return score; 774bf215546Sopenharmony_ci} 775bf215546Sopenharmony_ci 776bf215546Sopenharmony_ci/* If we speculatively tried a node, undo everything. 777bf215546Sopenharmony_ci */ 778bf215546Sopenharmony_ci 779bf215546Sopenharmony_cistatic void schedule_undo_node(sched_ctx *ctx, gpir_node *node) 780bf215546Sopenharmony_ci{ 781bf215546Sopenharmony_ci gpir_instr_remove_node(ctx->instr, node); 782bf215546Sopenharmony_ci 783bf215546Sopenharmony_ci gpir_node_foreach_pred(node, dep) { 784bf215546Sopenharmony_ci gpir_node *pred = dep->pred; 785bf215546Sopenharmony_ci if (pred->sched.instr) { 786bf215546Sopenharmony_ci schedule_undo_node(ctx, pred); 787bf215546Sopenharmony_ci } 788bf215546Sopenharmony_ci } 789bf215546Sopenharmony_ci} 790bf215546Sopenharmony_ci 791bf215546Sopenharmony_ci/* Try to schedule a node. We also try to schedule any predecessors that can 792bf215546Sopenharmony_ci * be part of the same instruction. If "speculative" is true, then we don't 793bf215546Sopenharmony_ci * actually change any state, only returning the score were the node to be 794bf215546Sopenharmony_ci * scheduled, with INT_MIN meaning "cannot be scheduled at all". 795bf215546Sopenharmony_ci */ 796bf215546Sopenharmony_cistatic int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative) 797bf215546Sopenharmony_ci{ 798bf215546Sopenharmony_ci int prev_slots = ctx->ready_list_slots; 799bf215546Sopenharmony_ci 800bf215546Sopenharmony_ci int score = _schedule_try_node(ctx, node, speculative); 801bf215546Sopenharmony_ci 802bf215546Sopenharmony_ci if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) { 803bf215546Sopenharmony_ci assert(speculative); 804bf215546Sopenharmony_ci ctx->total_spill_needed = MAX2(ctx->total_spill_needed, 805bf215546Sopenharmony_ci ctx->ready_list_slots - GPIR_VALUE_REG_NUM); 806bf215546Sopenharmony_ci score = INT_MIN; 807bf215546Sopenharmony_ci } 808bf215546Sopenharmony_ci 809bf215546Sopenharmony_ci if (speculative) { 810bf215546Sopenharmony_ci ctx->ready_list_slots = prev_slots; 811bf215546Sopenharmony_ci if (node->sched.instr) 812bf215546Sopenharmony_ci schedule_undo_node(ctx, node); 813bf215546Sopenharmony_ci } 814bf215546Sopenharmony_ci 815bf215546Sopenharmony_ci return score; 816bf215546Sopenharmony_ci} 817bf215546Sopenharmony_ci 818bf215546Sopenharmony_ci/* This is called when we want to spill "node" by inserting loads at its uses. 819bf215546Sopenharmony_ci * It returns all the possible registers we can use so that all the loads will 820bf215546Sopenharmony_ci * successfully be inserted. Also return the first instruction we'll need to 821bf215546Sopenharmony_ci * insert a load for. 822bf215546Sopenharmony_ci */ 823bf215546Sopenharmony_ci 824bf215546Sopenharmony_cistatic uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node, 825bf215546Sopenharmony_ci int *min_index) 826bf215546Sopenharmony_ci{ 827bf215546Sopenharmony_ci uint64_t available = ~0ull; 828bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 829bf215546Sopenharmony_ci if (dep->type != GPIR_DEP_INPUT) 830bf215546Sopenharmony_ci continue; 831bf215546Sopenharmony_ci 832bf215546Sopenharmony_ci gpir_node *use = dep->succ; 833bf215546Sopenharmony_ci gpir_instr *instr = use->sched.instr; 834bf215546Sopenharmony_ci 835bf215546Sopenharmony_ci if (!instr) { 836bf215546Sopenharmony_ci /* This use isn't scheduled, so no need to spill it. */ 837bf215546Sopenharmony_ci continue; 838bf215546Sopenharmony_ci } 839bf215546Sopenharmony_ci 840bf215546Sopenharmony_ci if (use->type == gpir_node_type_store) { 841bf215546Sopenharmony_ci /* We're trying to spill something that was recently stored... just 842bf215546Sopenharmony_ci * bail out. 843bf215546Sopenharmony_ci */ 844bf215546Sopenharmony_ci return 0; 845bf215546Sopenharmony_ci } 846bf215546Sopenharmony_ci 847bf215546Sopenharmony_ci if (use->op == gpir_op_mov && instr == ctx->instr) { 848bf215546Sopenharmony_ci /* We try to spill the sources of this move, so we can free up space 849bf215546Sopenharmony_ci * in the current instruction. 850bf215546Sopenharmony_ci * 851bf215546Sopenharmony_ci * TODO: should we go back further? It might let us schedule the 852bf215546Sopenharmony_ci * write earlier in some cases, but then we might fail to spill. 853bf215546Sopenharmony_ci */ 854bf215546Sopenharmony_ci available &= get_available_regs(ctx, use, min_index); 855bf215546Sopenharmony_ci } else { 856bf215546Sopenharmony_ci if (instr->index < *min_index) 857bf215546Sopenharmony_ci *min_index = instr->index; 858bf215546Sopenharmony_ci 859bf215546Sopenharmony_ci uint64_t use_available = 0; 860bf215546Sopenharmony_ci 861bf215546Sopenharmony_ci if (instr->reg0_use_count == 0) 862bf215546Sopenharmony_ci use_available = ~0ull; 863bf215546Sopenharmony_ci else if (!instr->reg0_is_attr) 864bf215546Sopenharmony_ci use_available = 0xfull << (4 * instr->reg0_index); 865bf215546Sopenharmony_ci 866bf215546Sopenharmony_ci if (instr->reg1_use_count == 0) 867bf215546Sopenharmony_ci use_available = ~0ull; 868bf215546Sopenharmony_ci else 869bf215546Sopenharmony_ci use_available |= 0xfull << (4 * instr->reg1_index); 870bf215546Sopenharmony_ci 871bf215546Sopenharmony_ci available &= use_available; 872bf215546Sopenharmony_ci } 873bf215546Sopenharmony_ci } 874bf215546Sopenharmony_ci 875bf215546Sopenharmony_ci return available; 876bf215546Sopenharmony_ci} 877bf215546Sopenharmony_ci 878bf215546Sopenharmony_ci/* Using "min_index" returned by get_available_regs(), figure out which 879bf215546Sopenharmony_ci * registers are killed by a write after or during the current instruction and 880bf215546Sopenharmony_ci * hence we can't use for spilling. Writes that haven't been scheduled yet 881bf215546Sopenharmony_ci * should be reflected in live_physregs. 882bf215546Sopenharmony_ci */ 883bf215546Sopenharmony_ci 884bf215546Sopenharmony_cistatic uint64_t get_killed_regs(sched_ctx *ctx, int min_index) 885bf215546Sopenharmony_ci{ 886bf215546Sopenharmony_ci uint64_t killed = 0; 887bf215546Sopenharmony_ci 888bf215546Sopenharmony_ci list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) { 889bf215546Sopenharmony_ci if (instr->index <= min_index) 890bf215546Sopenharmony_ci break; 891bf215546Sopenharmony_ci 892bf215546Sopenharmony_ci for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3; 893bf215546Sopenharmony_ci slot++) { 894bf215546Sopenharmony_ci if (!instr->slots[slot]) 895bf215546Sopenharmony_ci continue; 896bf215546Sopenharmony_ci 897bf215546Sopenharmony_ci gpir_store_node *store = gpir_node_to_store(instr->slots[slot]); 898bf215546Sopenharmony_ci if (store->node.op != gpir_op_store_reg) 899bf215546Sopenharmony_ci continue; 900bf215546Sopenharmony_ci 901bf215546Sopenharmony_ci killed |= 1ull << (4 * store->index + store->component); 902bf215546Sopenharmony_ci } 903bf215546Sopenharmony_ci } 904bf215546Sopenharmony_ci 905bf215546Sopenharmony_ci return killed; 906bf215546Sopenharmony_ci} 907bf215546Sopenharmony_ci 908bf215546Sopenharmony_ci/* Actually spill a node so that it is no longer in the ready list. Note that 909bf215546Sopenharmony_ci * this must exactly follow the logic of get_available_regs() or else the 910bf215546Sopenharmony_ci * loads could fail to schedule. 911bf215546Sopenharmony_ci */ 912bf215546Sopenharmony_ci 913bf215546Sopenharmony_cistatic void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store) 914bf215546Sopenharmony_ci{ 915bf215546Sopenharmony_ci gpir_node_foreach_succ_safe(node, dep) { 916bf215546Sopenharmony_ci if (dep->type != GPIR_DEP_INPUT) 917bf215546Sopenharmony_ci continue; 918bf215546Sopenharmony_ci 919bf215546Sopenharmony_ci gpir_node *use = dep->succ; 920bf215546Sopenharmony_ci gpir_instr *instr = use->sched.instr; 921bf215546Sopenharmony_ci 922bf215546Sopenharmony_ci if (!instr) 923bf215546Sopenharmony_ci continue; 924bf215546Sopenharmony_ci 925bf215546Sopenharmony_ci if (use->op == gpir_op_mov && instr == ctx->instr) { 926bf215546Sopenharmony_ci spill_node(ctx, use, store); 927bf215546Sopenharmony_ci } else { 928bf215546Sopenharmony_ci gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg); 929bf215546Sopenharmony_ci load->index = store->index; 930bf215546Sopenharmony_ci load->component = store->component; 931bf215546Sopenharmony_ci list_add(&load->node.list, &ctx->block->node_list); 932bf215546Sopenharmony_ci gpir_node_replace_child(dep->succ, dep->pred, &load->node); 933bf215546Sopenharmony_ci gpir_node_replace_pred(dep, &load->node); 934bf215546Sopenharmony_ci gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE); 935bf215546Sopenharmony_ci gpir_debug("spilling use %d of node %d to load node %d\n", 936bf215546Sopenharmony_ci use->index, node->index, load->node.index); 937bf215546Sopenharmony_ci ASSERTED bool result = _try_place_node(ctx, use->sched.instr, &load->node); 938bf215546Sopenharmony_ci assert(result); 939bf215546Sopenharmony_ci } 940bf215546Sopenharmony_ci } 941bf215546Sopenharmony_ci 942bf215546Sopenharmony_ci if (node->op == gpir_op_mov) { 943bf215546Sopenharmony_ci /* We replaced all the uses of the move, so it's dead now. */ 944bf215546Sopenharmony_ci gpir_instr_remove_node(node->sched.instr, node); 945bf215546Sopenharmony_ci gpir_node_delete(node); 946bf215546Sopenharmony_ci } else { 947bf215546Sopenharmony_ci /* We deleted all the uses of the node except the store, so it's not 948bf215546Sopenharmony_ci * live anymore. 949bf215546Sopenharmony_ci */ 950bf215546Sopenharmony_ci list_del(&node->list); 951bf215546Sopenharmony_ci node->sched.inserted = false; 952bf215546Sopenharmony_ci ctx->ready_list_slots--; 953bf215546Sopenharmony_ci if (node->sched.max_node) { 954bf215546Sopenharmony_ci node->sched.max_node = false; 955bf215546Sopenharmony_ci ctx->instr->alu_num_slot_needed_by_max--; 956bf215546Sopenharmony_ci } 957bf215546Sopenharmony_ci if (node->sched.next_max_node) { 958bf215546Sopenharmony_ci node->sched.next_max_node = false; 959bf215546Sopenharmony_ci ctx->instr->alu_num_unscheduled_next_max--; 960bf215546Sopenharmony_ci } 961bf215546Sopenharmony_ci } 962bf215546Sopenharmony_ci} 963bf215546Sopenharmony_ci 964bf215546Sopenharmony_cistatic bool used_by_store(gpir_node *node, gpir_instr *instr) 965bf215546Sopenharmony_ci{ 966bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 967bf215546Sopenharmony_ci if (dep->type != GPIR_DEP_INPUT) 968bf215546Sopenharmony_ci continue; 969bf215546Sopenharmony_ci 970bf215546Sopenharmony_ci if (dep->succ->type == gpir_node_type_store && 971bf215546Sopenharmony_ci dep->succ->sched.instr == instr) 972bf215546Sopenharmony_ci return true; 973bf215546Sopenharmony_ci } 974bf215546Sopenharmony_ci 975bf215546Sopenharmony_ci return false; 976bf215546Sopenharmony_ci} 977bf215546Sopenharmony_ci 978bf215546Sopenharmony_cistatic gpir_node *consuming_postlog2(gpir_node *node) 979bf215546Sopenharmony_ci{ 980bf215546Sopenharmony_ci if (node->op != gpir_op_complex1) 981bf215546Sopenharmony_ci return NULL; 982bf215546Sopenharmony_ci 983bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 984bf215546Sopenharmony_ci if (dep->type != GPIR_DEP_INPUT) 985bf215546Sopenharmony_ci continue; 986bf215546Sopenharmony_ci if (dep->succ->op == gpir_op_postlog2) 987bf215546Sopenharmony_ci return dep->succ; 988bf215546Sopenharmony_ci else 989bf215546Sopenharmony_ci return NULL; 990bf215546Sopenharmony_ci } 991bf215546Sopenharmony_ci 992bf215546Sopenharmony_ci return NULL; 993bf215546Sopenharmony_ci} 994bf215546Sopenharmony_ci 995bf215546Sopenharmony_cistatic bool try_spill_node(sched_ctx *ctx, gpir_node *node) 996bf215546Sopenharmony_ci{ 997bf215546Sopenharmony_ci assert(node->op != gpir_op_mov); 998bf215546Sopenharmony_ci 999bf215546Sopenharmony_ci if (used_by_store(node, ctx->instr)) 1000bf215546Sopenharmony_ci return false; 1001bf215546Sopenharmony_ci 1002bf215546Sopenharmony_ci gpir_debug("trying to spill %d\n", node->index); 1003bf215546Sopenharmony_ci 1004bf215546Sopenharmony_ci int min_instr = INT_MAX; 1005bf215546Sopenharmony_ci uint64_t available = get_available_regs(ctx, node, &min_instr); 1006bf215546Sopenharmony_ci available &= ~get_killed_regs(ctx, min_instr); 1007bf215546Sopenharmony_ci 1008bf215546Sopenharmony_ci if (node->sched.physreg_store) { 1009bf215546Sopenharmony_ci gpir_store_node *store = node->sched.physreg_store; 1010bf215546Sopenharmony_ci if (!(available & (1ull << (4 * store->index + store->component)))) 1011bf215546Sopenharmony_ci return false; 1012bf215546Sopenharmony_ci } else { 1013bf215546Sopenharmony_ci available &= ~ctx->live_physregs; 1014bf215546Sopenharmony_ci 1015bf215546Sopenharmony_ci if (available == 0) 1016bf215546Sopenharmony_ci return false; 1017bf215546Sopenharmony_ci 1018bf215546Sopenharmony_ci /* Don't spill complex1 if it's used postlog2, turn the postlog2 into a 1019bf215546Sopenharmony_ci * move, replace the complex1 with postlog2 and spill that instead. The 1020bf215546Sopenharmony_ci * store needs a move anyways so the postlog2 is usually free. 1021bf215546Sopenharmony_ci */ 1022bf215546Sopenharmony_ci gpir_node *postlog2 = consuming_postlog2(node); 1023bf215546Sopenharmony_ci if (postlog2) { 1024bf215546Sopenharmony_ci postlog2->op = gpir_op_mov; 1025bf215546Sopenharmony_ci node = create_postlog2(ctx, node); 1026bf215546Sopenharmony_ci } 1027bf215546Sopenharmony_ci 1028bf215546Sopenharmony_ci /* TODO: use a better heuristic for choosing an available register? */ 1029bf215546Sopenharmony_ci int physreg = ffsll(available) - 1; 1030bf215546Sopenharmony_ci 1031bf215546Sopenharmony_ci ctx->live_physregs |= (1ull << physreg); 1032bf215546Sopenharmony_ci 1033bf215546Sopenharmony_ci gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg); 1034bf215546Sopenharmony_ci store->index = physreg / 4; 1035bf215546Sopenharmony_ci store->component = physreg % 4; 1036bf215546Sopenharmony_ci store->child = node; 1037bf215546Sopenharmony_ci store->node.sched.max_node = false; 1038bf215546Sopenharmony_ci store->node.sched.next_max_node = false; 1039bf215546Sopenharmony_ci store->node.sched.complex_allowed = false; 1040bf215546Sopenharmony_ci store->node.sched.pos = -1; 1041bf215546Sopenharmony_ci store->node.sched.instr = NULL; 1042bf215546Sopenharmony_ci store->node.sched.inserted = false; 1043bf215546Sopenharmony_ci store->node.sched.dist = node->sched.dist; 1044bf215546Sopenharmony_ci if (node->op == gpir_op_complex1) { 1045bf215546Sopenharmony_ci /* Complex1 cannot be directly stored, and has a latency of 2 */ 1046bf215546Sopenharmony_ci store->node.sched.dist += 2; 1047bf215546Sopenharmony_ci } 1048bf215546Sopenharmony_ci node->sched.physreg_store = store; 1049bf215546Sopenharmony_ci gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT); 1050bf215546Sopenharmony_ci 1051bf215546Sopenharmony_ci list_for_each_entry(gpir_load_node, load, 1052bf215546Sopenharmony_ci &ctx->physreg_reads[physreg], reg_link) { 1053bf215546Sopenharmony_ci gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ); 1054bf215546Sopenharmony_ci if (load->node.sched.ready) { 1055bf215546Sopenharmony_ci list_del(&load->node.list); 1056bf215546Sopenharmony_ci load->node.sched.ready = false; 1057bf215546Sopenharmony_ci } 1058bf215546Sopenharmony_ci } 1059bf215546Sopenharmony_ci 1060bf215546Sopenharmony_ci node->sched.ready = false; 1061bf215546Sopenharmony_ci schedule_insert_ready_list(ctx, &store->node); 1062bf215546Sopenharmony_ci } 1063bf215546Sopenharmony_ci 1064bf215546Sopenharmony_ci gpir_debug("spilling %d to $%d.%c, store %d\n", node->index, 1065bf215546Sopenharmony_ci node->sched.physreg_store->index, 1066bf215546Sopenharmony_ci "xyzw"[node->sched.physreg_store->component], 1067bf215546Sopenharmony_ci node->sched.physreg_store->node.index); 1068bf215546Sopenharmony_ci 1069bf215546Sopenharmony_ci spill_node(ctx, node, node->sched.physreg_store); 1070bf215546Sopenharmony_ci 1071bf215546Sopenharmony_ci return true; 1072bf215546Sopenharmony_ci} 1073bf215546Sopenharmony_ci 1074bf215546Sopenharmony_cistatic bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node) 1075bf215546Sopenharmony_ci{ 1076bf215546Sopenharmony_ci /* First, try to spill max nodes. */ 1077bf215546Sopenharmony_ci list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) { 1078bf215546Sopenharmony_ci if (ctx->max_node_spill_needed <= 0) 1079bf215546Sopenharmony_ci break; 1080bf215546Sopenharmony_ci 1081bf215546Sopenharmony_ci /* orig_node is the node we're trying to schedule, so spilling it makes 1082bf215546Sopenharmony_ci * no sense. Also don't try to spill any nodes in front of it, since 1083bf215546Sopenharmony_ci * they might be scheduled instead. 1084bf215546Sopenharmony_ci */ 1085bf215546Sopenharmony_ci if (node == orig_node) 1086bf215546Sopenharmony_ci break; 1087bf215546Sopenharmony_ci 1088bf215546Sopenharmony_ci if (node->op == gpir_op_mov) { 1089bf215546Sopenharmony_ci /* Don't try to spill loads, since that only adds another load and 1090bf215546Sopenharmony_ci * store which is likely pointless. 1091bf215546Sopenharmony_ci */ 1092bf215546Sopenharmony_ci continue; 1093bf215546Sopenharmony_ci } 1094bf215546Sopenharmony_ci 1095bf215546Sopenharmony_ci if (!gpir_is_input_node(node) || !node->sched.max_node) 1096bf215546Sopenharmony_ci continue; 1097bf215546Sopenharmony_ci 1098bf215546Sopenharmony_ci if (try_spill_node(ctx, node)) { 1099bf215546Sopenharmony_ci ctx->max_node_spill_needed--; 1100bf215546Sopenharmony_ci ctx->total_spill_needed--; 1101bf215546Sopenharmony_ci } 1102bf215546Sopenharmony_ci } 1103bf215546Sopenharmony_ci 1104bf215546Sopenharmony_ci /* Now, try to spill the remaining nodes. */ 1105bf215546Sopenharmony_ci list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) { 1106bf215546Sopenharmony_ci if (ctx->total_spill_needed <= 0) 1107bf215546Sopenharmony_ci break; 1108bf215546Sopenharmony_ci 1109bf215546Sopenharmony_ci if (node == orig_node) 1110bf215546Sopenharmony_ci break; 1111bf215546Sopenharmony_ci 1112bf215546Sopenharmony_ci if (node->op == gpir_op_mov) 1113bf215546Sopenharmony_ci continue; 1114bf215546Sopenharmony_ci 1115bf215546Sopenharmony_ci if (!gpir_is_input_node(node) || 1116bf215546Sopenharmony_ci !(node->sched.max_node || node->sched.next_max_node)) 1117bf215546Sopenharmony_ci continue; 1118bf215546Sopenharmony_ci 1119bf215546Sopenharmony_ci if (try_spill_node(ctx, node)) 1120bf215546Sopenharmony_ci ctx->total_spill_needed--; 1121bf215546Sopenharmony_ci } 1122bf215546Sopenharmony_ci 1123bf215546Sopenharmony_ci return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0; 1124bf215546Sopenharmony_ci} 1125bf215546Sopenharmony_ci 1126bf215546Sopenharmony_cistatic int ASSERTED gpir_get_curr_ready_list_slots(sched_ctx *ctx) 1127bf215546Sopenharmony_ci{ 1128bf215546Sopenharmony_ci int total = 0; 1129bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { 1130bf215546Sopenharmony_ci total += gpir_get_slots_required(node); 1131bf215546Sopenharmony_ci } 1132bf215546Sopenharmony_ci 1133bf215546Sopenharmony_ci return total; 1134bf215546Sopenharmony_ci} 1135bf215546Sopenharmony_ci 1136bf215546Sopenharmony_ci/* What gpir_get_min_end() would return if node were replaced with a move 1137bf215546Sopenharmony_ci * instruction not in the complex slot. Normally this is 2 + min_end, except 1138bf215546Sopenharmony_ci * for some store instructions which must have the move node in the same 1139bf215546Sopenharmony_ci * instruction. 1140bf215546Sopenharmony_ci */ 1141bf215546Sopenharmony_cistatic int gpir_get_min_end_as_move(gpir_node *node) 1142bf215546Sopenharmony_ci{ 1143bf215546Sopenharmony_ci int min = INT_MAX; 1144bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 1145bf215546Sopenharmony_ci gpir_node *succ = dep->succ; 1146bf215546Sopenharmony_ci if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) { 1147bf215546Sopenharmony_ci switch (succ->op) { 1148bf215546Sopenharmony_ci case gpir_op_store_temp: 1149bf215546Sopenharmony_ci case gpir_op_store_reg: 1150bf215546Sopenharmony_ci case gpir_op_store_varying: 1151bf215546Sopenharmony_ci continue; 1152bf215546Sopenharmony_ci default: 1153bf215546Sopenharmony_ci break; 1154bf215546Sopenharmony_ci } 1155bf215546Sopenharmony_ci if (min > succ->sched.instr->index + 2) 1156bf215546Sopenharmony_ci min = succ->sched.instr->index + 2; 1157bf215546Sopenharmony_ci } 1158bf215546Sopenharmony_ci } 1159bf215546Sopenharmony_ci return min; 1160bf215546Sopenharmony_ci} 1161bf215546Sopenharmony_ci 1162bf215546Sopenharmony_ci/* The second source for add0, add1, mul0, and mul1 units cannot be complex. 1163bf215546Sopenharmony_ci * The hardware overwrites the add second sources with 0 and mul second 1164bf215546Sopenharmony_ci * sources with 1. This can be a problem if we need to insert more next-max 1165bf215546Sopenharmony_ci * moves but we only have values that can't use the complex unit for moves. 1166bf215546Sopenharmony_ci * 1167bf215546Sopenharmony_ci * Fortunately, we only need to insert a next-max move if there are more than 1168bf215546Sopenharmony_ci * 5 next-max nodes, but there are only 4 sources in the previous instruction 1169bf215546Sopenharmony_ci * that make values not complex-capable, which means there can be at most 4 1170bf215546Sopenharmony_ci * non-complex-capable values. Hence there will always be at least two values 1171bf215546Sopenharmony_ci * that can be rewritten to use a move in the complex slot. However, we have 1172bf215546Sopenharmony_ci * to be careful not to waste those values by putting both of them in a 1173bf215546Sopenharmony_ci * non-complex slot. This is handled for us by gpir_instr, which will reject 1174bf215546Sopenharmony_ci * such instructions. We just need to tell it which nodes can use complex, and 1175bf215546Sopenharmony_ci * it will do the accounting to figure out what is safe. 1176bf215546Sopenharmony_ci */ 1177bf215546Sopenharmony_ci 1178bf215546Sopenharmony_cistatic bool can_use_complex(gpir_node *node) 1179bf215546Sopenharmony_ci{ 1180bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 1181bf215546Sopenharmony_ci if (dep->type != GPIR_DEP_INPUT) 1182bf215546Sopenharmony_ci continue; 1183bf215546Sopenharmony_ci 1184bf215546Sopenharmony_ci gpir_node *succ = dep->succ; 1185bf215546Sopenharmony_ci if (succ->type != gpir_node_type_alu || 1186bf215546Sopenharmony_ci !succ->sched.instr) 1187bf215546Sopenharmony_ci continue; 1188bf215546Sopenharmony_ci 1189bf215546Sopenharmony_ci /* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1} 1190bf215546Sopenharmony_ci */ 1191bf215546Sopenharmony_ci gpir_alu_node *alu = gpir_node_to_alu(succ); 1192bf215546Sopenharmony_ci switch (alu->node.op) { 1193bf215546Sopenharmony_ci case gpir_op_complex1: 1194bf215546Sopenharmony_ci /* complex1 puts its third source in the fourth slot */ 1195bf215546Sopenharmony_ci if (alu->children[1] == node || alu->children[2] == node) 1196bf215546Sopenharmony_ci return false; 1197bf215546Sopenharmony_ci break; 1198bf215546Sopenharmony_ci case gpir_op_complex2: 1199bf215546Sopenharmony_ci /* complex2 has its source duplicated, since it actually takes two 1200bf215546Sopenharmony_ci * sources but we only ever use it with both sources the same. Hence 1201bf215546Sopenharmony_ci * its source can never be the complex slot. 1202bf215546Sopenharmony_ci */ 1203bf215546Sopenharmony_ci return false; 1204bf215546Sopenharmony_ci case gpir_op_select: 1205bf215546Sopenharmony_ci /* Select has its sources rearranged */ 1206bf215546Sopenharmony_ci if (alu->children[0] == node) 1207bf215546Sopenharmony_ci return false; 1208bf215546Sopenharmony_ci break; 1209bf215546Sopenharmony_ci default: 1210bf215546Sopenharmony_ci assert(alu->num_child <= 2); 1211bf215546Sopenharmony_ci if (alu->num_child == 2 && alu->children[1] == node) 1212bf215546Sopenharmony_ci return false; 1213bf215546Sopenharmony_ci break; 1214bf215546Sopenharmony_ci } 1215bf215546Sopenharmony_ci } 1216bf215546Sopenharmony_ci 1217bf215546Sopenharmony_ci return true; 1218bf215546Sopenharmony_ci} 1219bf215546Sopenharmony_ci 1220bf215546Sopenharmony_ci/* Initialize node->sched.max_node and node->sched.next_max_node for every 1221bf215546Sopenharmony_ci * input node on the ready list. We should only need to do this once per 1222bf215546Sopenharmony_ci * instruction, at the beginning, since we never add max nodes to the ready 1223bf215546Sopenharmony_ci * list. 1224bf215546Sopenharmony_ci */ 1225bf215546Sopenharmony_ci 1226bf215546Sopenharmony_cistatic void sched_find_max_nodes(sched_ctx *ctx) 1227bf215546Sopenharmony_ci{ 1228bf215546Sopenharmony_ci ctx->instr->alu_num_unscheduled_next_max = 0; 1229bf215546Sopenharmony_ci ctx->instr->alu_num_slot_needed_by_max = 0; 1230bf215546Sopenharmony_ci 1231bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { 1232bf215546Sopenharmony_ci if (!gpir_is_input_node(node)) 1233bf215546Sopenharmony_ci continue; 1234bf215546Sopenharmony_ci 1235bf215546Sopenharmony_ci int min_end_move = gpir_get_min_end_as_move(node); 1236bf215546Sopenharmony_ci node->sched.max_node = (min_end_move == ctx->instr->index); 1237bf215546Sopenharmony_ci node->sched.next_max_node = (min_end_move == ctx->instr->index + 1); 1238bf215546Sopenharmony_ci if (node->sched.next_max_node) 1239bf215546Sopenharmony_ci node->sched.complex_allowed = can_use_complex(node); 1240bf215546Sopenharmony_ci 1241bf215546Sopenharmony_ci if (node->sched.max_node) 1242bf215546Sopenharmony_ci ctx->instr->alu_num_slot_needed_by_max++; 1243bf215546Sopenharmony_ci if (node->sched.next_max_node) 1244bf215546Sopenharmony_ci ctx->instr->alu_num_unscheduled_next_max++; 1245bf215546Sopenharmony_ci } 1246bf215546Sopenharmony_ci} 1247bf215546Sopenharmony_ci 1248bf215546Sopenharmony_ci/* Verify the invariants described in gpir.h, as well as making sure the 1249bf215546Sopenharmony_ci * counts are correct. 1250bf215546Sopenharmony_ci */ 1251bf215546Sopenharmony_cistatic void ASSERTED verify_max_nodes(sched_ctx *ctx) 1252bf215546Sopenharmony_ci{ 1253bf215546Sopenharmony_ci int alu_num_slot_needed_by_max = 0; 1254bf215546Sopenharmony_ci int alu_num_unscheduled_next_max = 0; 1255bf215546Sopenharmony_ci int alu_num_slot_needed_by_store = 0; 1256bf215546Sopenharmony_ci int alu_num_slot_needed_by_non_cplx_store = 0; 1257bf215546Sopenharmony_ci ASSERTED int alu_max_allowed_next_max = 5; 1258bf215546Sopenharmony_ci 1259bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { 1260bf215546Sopenharmony_ci if (!gpir_is_input_node(node)) 1261bf215546Sopenharmony_ci continue; 1262bf215546Sopenharmony_ci 1263bf215546Sopenharmony_ci if (node->sched.max_node) 1264bf215546Sopenharmony_ci alu_num_slot_needed_by_max++; 1265bf215546Sopenharmony_ci if (node->sched.next_max_node) 1266bf215546Sopenharmony_ci alu_num_unscheduled_next_max++; 1267bf215546Sopenharmony_ci if (used_by_store(node, ctx->instr)) { 1268bf215546Sopenharmony_ci alu_num_slot_needed_by_store++; 1269bf215546Sopenharmony_ci if (node->sched.next_max_node && !node->sched.complex_allowed) 1270bf215546Sopenharmony_ci alu_num_slot_needed_by_non_cplx_store++; 1271bf215546Sopenharmony_ci } 1272bf215546Sopenharmony_ci } 1273bf215546Sopenharmony_ci 1274bf215546Sopenharmony_ci if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] && 1275bf215546Sopenharmony_ci ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1) 1276bf215546Sopenharmony_ci alu_max_allowed_next_max = 4; 1277bf215546Sopenharmony_ci 1278bf215546Sopenharmony_ci assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max); 1279bf215546Sopenharmony_ci assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max); 1280bf215546Sopenharmony_ci assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max); 1281bf215546Sopenharmony_ci assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store); 1282bf215546Sopenharmony_ci assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store == 1283bf215546Sopenharmony_ci alu_num_slot_needed_by_non_cplx_store); 1284bf215546Sopenharmony_ci assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0)); 1285bf215546Sopenharmony_ci assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store); 1286bf215546Sopenharmony_ci} 1287bf215546Sopenharmony_ci 1288bf215546Sopenharmony_cistatic bool try_node(sched_ctx *ctx) 1289bf215546Sopenharmony_ci{ 1290bf215546Sopenharmony_ci gpir_node *best_node = NULL; 1291bf215546Sopenharmony_ci int best_score = INT_MIN; 1292bf215546Sopenharmony_ci 1293bf215546Sopenharmony_ci /* Spilling will delete arbitrary nodes after the current one in the ready 1294bf215546Sopenharmony_ci * list, which means that we always need to look up the next node in the 1295bf215546Sopenharmony_ci * list at the end of each iteration. While list_for_each_entry() works for 1296bf215546Sopenharmony_ci * this purpose, its sanity checking assumes that you don't want to modify 1297bf215546Sopenharmony_ci * the list at all. We know better here, so we have to open-code 1298bf215546Sopenharmony_ci * list_for_each_entry() without the check in order to not assert. 1299bf215546Sopenharmony_ci */ 1300bf215546Sopenharmony_ci for (gpir_node *node = list_entry(ctx->ready_list.next, gpir_node, list); 1301bf215546Sopenharmony_ci &node->list != &ctx->ready_list; 1302bf215546Sopenharmony_ci node = list_entry(node->list.next, gpir_node, list)) { 1303bf215546Sopenharmony_ci if (best_score != INT_MIN) { 1304bf215546Sopenharmony_ci if (node->sched.dist < best_node->sched.dist) 1305bf215546Sopenharmony_ci break; 1306bf215546Sopenharmony_ci } 1307bf215546Sopenharmony_ci 1308bf215546Sopenharmony_ci if (node->sched.ready) { 1309bf215546Sopenharmony_ci ctx->total_spill_needed = 0; 1310bf215546Sopenharmony_ci ctx->max_node_spill_needed = 0; 1311bf215546Sopenharmony_ci int score = schedule_try_node(ctx, node, true); 1312bf215546Sopenharmony_ci if (score == INT_MIN && !best_node && 1313bf215546Sopenharmony_ci ctx->total_spill_needed > 0 && 1314bf215546Sopenharmony_ci try_spill_nodes(ctx, node)) { 1315bf215546Sopenharmony_ci score = schedule_try_node(ctx, node, true); 1316bf215546Sopenharmony_ci } 1317bf215546Sopenharmony_ci 1318bf215546Sopenharmony_ci /* schedule_first nodes must be scheduled if possible */ 1319bf215546Sopenharmony_ci if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) { 1320bf215546Sopenharmony_ci best_node = node; 1321bf215546Sopenharmony_ci best_score = score; 1322bf215546Sopenharmony_ci break; 1323bf215546Sopenharmony_ci } 1324bf215546Sopenharmony_ci 1325bf215546Sopenharmony_ci if (score > best_score) { 1326bf215546Sopenharmony_ci best_score = score; 1327bf215546Sopenharmony_ci best_node = node; 1328bf215546Sopenharmony_ci } 1329bf215546Sopenharmony_ci } 1330bf215546Sopenharmony_ci } 1331bf215546Sopenharmony_ci 1332bf215546Sopenharmony_ci if (best_node) { 1333bf215546Sopenharmony_ci gpir_debug("scheduling %d (score = %d)%s\n", best_node->index, 1334bf215546Sopenharmony_ci best_score, best_node->sched.max_node ? " (max)" : ""); 1335bf215546Sopenharmony_ci ASSERTED int score = schedule_try_node(ctx, best_node, false); 1336bf215546Sopenharmony_ci assert(score != INT_MIN); 1337bf215546Sopenharmony_ci return true; 1338bf215546Sopenharmony_ci } 1339bf215546Sopenharmony_ci 1340bf215546Sopenharmony_ci return false; 1341bf215546Sopenharmony_ci} 1342bf215546Sopenharmony_ci 1343bf215546Sopenharmony_cistatic void place_move(sched_ctx *ctx, gpir_node *node) 1344bf215546Sopenharmony_ci{ 1345bf215546Sopenharmony_ci /* For complex1 that is consumed by a postlog2, we cannot allow any moves 1346bf215546Sopenharmony_ci * in between. Convert the postlog2 to a move and insert a new postlog2, 1347bf215546Sopenharmony_ci * and try to schedule it again in try_node(). 1348bf215546Sopenharmony_ci */ 1349bf215546Sopenharmony_ci gpir_node *postlog2 = consuming_postlog2(node); 1350bf215546Sopenharmony_ci if (postlog2) { 1351bf215546Sopenharmony_ci postlog2->op = gpir_op_mov; 1352bf215546Sopenharmony_ci create_postlog2(ctx, node); 1353bf215546Sopenharmony_ci return; 1354bf215546Sopenharmony_ci } 1355bf215546Sopenharmony_ci 1356bf215546Sopenharmony_ci gpir_node *move = create_move(ctx, node); 1357bf215546Sopenharmony_ci gpir_node_foreach_succ_safe(move, dep) { 1358bf215546Sopenharmony_ci gpir_node *succ = dep->succ; 1359bf215546Sopenharmony_ci if (!succ->sched.instr || 1360bf215546Sopenharmony_ci ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) { 1361bf215546Sopenharmony_ci gpir_node_replace_pred(dep, node); 1362bf215546Sopenharmony_ci if (dep->type == GPIR_DEP_INPUT) 1363bf215546Sopenharmony_ci gpir_node_replace_child(succ, move, node); 1364bf215546Sopenharmony_ci } 1365bf215546Sopenharmony_ci } 1366bf215546Sopenharmony_ci ASSERTED int score = schedule_try_node(ctx, move, false); 1367bf215546Sopenharmony_ci assert(score != INT_MIN); 1368bf215546Sopenharmony_ci} 1369bf215546Sopenharmony_ci 1370bf215546Sopenharmony_ci/* For next-max nodes, not every node can be offloaded to a move in the 1371bf215546Sopenharmony_ci * complex slot. If we run out of non-complex slots, then such nodes cannot 1372bf215546Sopenharmony_ci * have moves placed for them. There should always be sufficient 1373bf215546Sopenharmony_ci * complex-capable nodes so that this isn't a problem. We also disallow moves 1374bf215546Sopenharmony_ci * for schedule_first nodes here. 1375bf215546Sopenharmony_ci */ 1376bf215546Sopenharmony_cistatic bool can_place_move(sched_ctx *ctx, gpir_node *node) 1377bf215546Sopenharmony_ci{ 1378bf215546Sopenharmony_ci if (gpir_op_infos[node->op].schedule_first) 1379bf215546Sopenharmony_ci return false; 1380bf215546Sopenharmony_ci 1381bf215546Sopenharmony_ci if (!node->sched.next_max_node) 1382bf215546Sopenharmony_ci return true; 1383bf215546Sopenharmony_ci 1384bf215546Sopenharmony_ci if (node->sched.complex_allowed) 1385bf215546Sopenharmony_ci return true; 1386bf215546Sopenharmony_ci 1387bf215546Sopenharmony_ci return ctx->instr->alu_non_cplx_slot_free > 0; 1388bf215546Sopenharmony_ci} 1389bf215546Sopenharmony_ci 1390bf215546Sopenharmony_cistatic bool sched_move(sched_ctx *ctx) 1391bf215546Sopenharmony_ci{ 1392bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { 1393bf215546Sopenharmony_ci if (node->sched.max_node) { 1394bf215546Sopenharmony_ci place_move(ctx, node); 1395bf215546Sopenharmony_ci return true; 1396bf215546Sopenharmony_ci } 1397bf215546Sopenharmony_ci } 1398bf215546Sopenharmony_ci 1399bf215546Sopenharmony_ci if (ctx->instr->alu_num_slot_needed_by_store > 0) { 1400bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { 1401bf215546Sopenharmony_ci if (used_by_store(node, ctx->instr)) { 1402bf215546Sopenharmony_ci place_move(ctx, node); 1403bf215546Sopenharmony_ci /* If we have a store of a load, then we need to make sure that we 1404bf215546Sopenharmony_ci * immediately schedule the dependent load, or create a move 1405bf215546Sopenharmony_ci * instruction for it, like we would with a normal instruction. 1406bf215546Sopenharmony_ci * The rest of the code isn't set up to handle load nodes in the 1407bf215546Sopenharmony_ci * ready list -- see the comments in _schedule_try_node(). 1408bf215546Sopenharmony_ci */ 1409bf215546Sopenharmony_ci if (node->type == gpir_node_type_load) { 1410bf215546Sopenharmony_ci if (!schedule_try_place_node(ctx, node, false)) { 1411bf215546Sopenharmony_ci create_move(ctx, node); 1412bf215546Sopenharmony_ci } 1413bf215546Sopenharmony_ci } 1414bf215546Sopenharmony_ci return true; 1415bf215546Sopenharmony_ci } 1416bf215546Sopenharmony_ci } 1417bf215546Sopenharmony_ci } 1418bf215546Sopenharmony_ci 1419bf215546Sopenharmony_ci /* complex1 is a bit a special case, since it has a latency of 2 cycles. 1420bf215546Sopenharmony_ci * Once it is fully ready, we need to group all its uses in the same 1421bf215546Sopenharmony_ci * instruction, and then we need to avoid creating any moves in the next 1422bf215546Sopenharmony_ci * cycle in order to get it scheduled. Failing to do any of these things 1423bf215546Sopenharmony_ci * could result in a cycle penalty, or even worse, an infinite loop of 1424bf215546Sopenharmony_ci * inserting moves. If it is a next-max node and ready, then it has a use 1425bf215546Sopenharmony_ci * in the previous cycle. If it has a use in the current cycle as well, 1426bf215546Sopenharmony_ci * then we want to insert a move node to make it ready in two cycles -- if 1427bf215546Sopenharmony_ci * we don't, then there will be at least a one cycle penalty. Otherwise, it 1428bf215546Sopenharmony_ci * will be ready next cycle, and we shouldn't insert a move node, or else 1429bf215546Sopenharmony_ci * we'll also have a one cycle penalty. 1430bf215546Sopenharmony_ci */ 1431bf215546Sopenharmony_ci if (ctx->instr->alu_num_slot_free > 0) { 1432bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { 1433bf215546Sopenharmony_ci if (!can_place_move(ctx, node)) 1434bf215546Sopenharmony_ci continue; 1435bf215546Sopenharmony_ci 1436bf215546Sopenharmony_ci if (node->sched.next_max_node && node->op == gpir_op_complex1 && 1437bf215546Sopenharmony_ci node->sched.ready) { 1438bf215546Sopenharmony_ci bool skip = true; 1439bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 1440bf215546Sopenharmony_ci if (dep->type != GPIR_DEP_INPUT) 1441bf215546Sopenharmony_ci continue; 1442bf215546Sopenharmony_ci 1443bf215546Sopenharmony_ci gpir_node *succ = dep->succ; 1444bf215546Sopenharmony_ci 1445bf215546Sopenharmony_ci if (!succ->sched.instr || 1446bf215546Sopenharmony_ci succ->sched.instr->index != ctx->instr->index - 1) { 1447bf215546Sopenharmony_ci skip = false; 1448bf215546Sopenharmony_ci break; 1449bf215546Sopenharmony_ci } 1450bf215546Sopenharmony_ci } 1451bf215546Sopenharmony_ci 1452bf215546Sopenharmony_ci if (skip) 1453bf215546Sopenharmony_ci continue; 1454bf215546Sopenharmony_ci 1455bf215546Sopenharmony_ci place_move(ctx, node); 1456bf215546Sopenharmony_ci return true; 1457bf215546Sopenharmony_ci } 1458bf215546Sopenharmony_ci } 1459bf215546Sopenharmony_ci } 1460bf215546Sopenharmony_ci 1461bf215546Sopenharmony_ci /* Once we've made all the required moves, we're free to use any extra 1462bf215546Sopenharmony_ci * slots to schedule more moves for next max nodes. Besides sometimes being 1463bf215546Sopenharmony_ci * necessary, this can free up extra space in the next instruction. We walk 1464bf215546Sopenharmony_ci * from back to front so that we pick nodes less likely to be scheduled 1465bf215546Sopenharmony_ci * next first -- an extra move would be unnecessary there. But make sure 1466bf215546Sopenharmony_ci * not to handle the complex1 case handled above. 1467bf215546Sopenharmony_ci */ 1468bf215546Sopenharmony_ci if (ctx->instr->alu_num_slot_free > 0) { 1469bf215546Sopenharmony_ci list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) { 1470bf215546Sopenharmony_ci if (!can_place_move(ctx, node)) 1471bf215546Sopenharmony_ci continue; 1472bf215546Sopenharmony_ci 1473bf215546Sopenharmony_ci if (node->sched.next_max_node && 1474bf215546Sopenharmony_ci !(node->op == gpir_op_complex1 && node->sched.ready)) { 1475bf215546Sopenharmony_ci place_move(ctx, node); 1476bf215546Sopenharmony_ci return true; 1477bf215546Sopenharmony_ci } 1478bf215546Sopenharmony_ci } 1479bf215546Sopenharmony_ci } 1480bf215546Sopenharmony_ci 1481bf215546Sopenharmony_ci /* We may have skipped complex1 above, but if we run out of space, we still 1482bf215546Sopenharmony_ci * need to insert the move. 1483bf215546Sopenharmony_ci */ 1484bf215546Sopenharmony_ci 1485bf215546Sopenharmony_ci if (ctx->instr->alu_num_unscheduled_next_max > 1486bf215546Sopenharmony_ci ctx->instr->alu_max_allowed_next_max) { 1487bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { 1488bf215546Sopenharmony_ci if (!can_place_move(ctx, node)) 1489bf215546Sopenharmony_ci continue; 1490bf215546Sopenharmony_ci 1491bf215546Sopenharmony_ci if (node->sched.next_max_node) { 1492bf215546Sopenharmony_ci place_move(ctx, node); 1493bf215546Sopenharmony_ci return true; 1494bf215546Sopenharmony_ci } 1495bf215546Sopenharmony_ci } 1496bf215546Sopenharmony_ci } 1497bf215546Sopenharmony_ci 1498bf215546Sopenharmony_ci 1499bf215546Sopenharmony_ci return false; 1500bf215546Sopenharmony_ci} 1501bf215546Sopenharmony_ci 1502bf215546Sopenharmony_cistatic bool gpir_sched_instr_pass(sched_ctx *ctx) 1503bf215546Sopenharmony_ci{ 1504bf215546Sopenharmony_ci if (try_node(ctx)) 1505bf215546Sopenharmony_ci return true; 1506bf215546Sopenharmony_ci 1507bf215546Sopenharmony_ci if (sched_move(ctx)) 1508bf215546Sopenharmony_ci return true; 1509bf215546Sopenharmony_ci 1510bf215546Sopenharmony_ci return false; 1511bf215546Sopenharmony_ci} 1512bf215546Sopenharmony_ci 1513bf215546Sopenharmony_cistatic void schedule_print_pre_one_instr(sched_ctx *ctx) 1514bf215546Sopenharmony_ci{ 1515bf215546Sopenharmony_ci if (!(lima_debug & LIMA_DEBUG_GP)) 1516bf215546Sopenharmony_ci return; 1517bf215546Sopenharmony_ci 1518bf215546Sopenharmony_ci printf("instr %d for ready list:", ctx->instr->index); 1519bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { 1520bf215546Sopenharmony_ci printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p', 1521bf215546Sopenharmony_ci node->sched.dist, gpir_get_slots_required(node), 1522bf215546Sopenharmony_ci node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none")); 1523bf215546Sopenharmony_ci } 1524bf215546Sopenharmony_ci printf("\nlive physregs: "); 1525bf215546Sopenharmony_ci for (unsigned i = 0; i < 16; i++) { 1526bf215546Sopenharmony_ci if (ctx->live_physregs & (0xfull << (4 * i))) { 1527bf215546Sopenharmony_ci printf("$%d.", i); 1528bf215546Sopenharmony_ci for (unsigned j = 0; j < 4; j++) { 1529bf215546Sopenharmony_ci if (ctx->live_physregs & (1ull << (4 * i + j))) 1530bf215546Sopenharmony_ci printf("%c", "xyzw"[j]); 1531bf215546Sopenharmony_ci } 1532bf215546Sopenharmony_ci printf(" "); 1533bf215546Sopenharmony_ci } 1534bf215546Sopenharmony_ci } 1535bf215546Sopenharmony_ci printf("\n"); 1536bf215546Sopenharmony_ci} 1537bf215546Sopenharmony_ci 1538bf215546Sopenharmony_cistatic void schedule_print_post_one_instr(gpir_instr *instr) 1539bf215546Sopenharmony_ci{ 1540bf215546Sopenharmony_ci if (!(lima_debug & LIMA_DEBUG_GP)) 1541bf215546Sopenharmony_ci return; 1542bf215546Sopenharmony_ci 1543bf215546Sopenharmony_ci printf("post schedule instr"); 1544bf215546Sopenharmony_ci for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) { 1545bf215546Sopenharmony_ci if (instr->slots[i]) 1546bf215546Sopenharmony_ci printf(" %d/%d", i, instr->slots[i]->index); 1547bf215546Sopenharmony_ci } 1548bf215546Sopenharmony_ci printf("\n"); 1549bf215546Sopenharmony_ci} 1550bf215546Sopenharmony_ci 1551bf215546Sopenharmony_ci 1552bf215546Sopenharmony_cistatic bool schedule_one_instr(sched_ctx *ctx) 1553bf215546Sopenharmony_ci{ 1554bf215546Sopenharmony_ci gpir_instr *instr = gpir_instr_create(ctx->block); 1555bf215546Sopenharmony_ci if (unlikely(!instr)) 1556bf215546Sopenharmony_ci return false; 1557bf215546Sopenharmony_ci 1558bf215546Sopenharmony_ci ctx->instr = instr; 1559bf215546Sopenharmony_ci 1560bf215546Sopenharmony_ci sched_find_max_nodes(ctx); 1561bf215546Sopenharmony_ci schedule_print_pre_one_instr(ctx); 1562bf215546Sopenharmony_ci 1563bf215546Sopenharmony_ci while (gpir_sched_instr_pass(ctx)) { 1564bf215546Sopenharmony_ci assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx)); 1565bf215546Sopenharmony_ci#ifndef NDEBUG 1566bf215546Sopenharmony_ci verify_max_nodes(ctx); 1567bf215546Sopenharmony_ci verify_ready_list(ctx); 1568bf215546Sopenharmony_ci#endif 1569bf215546Sopenharmony_ci } 1570bf215546Sopenharmony_ci 1571bf215546Sopenharmony_ci schedule_print_post_one_instr(instr); 1572bf215546Sopenharmony_ci return true; 1573bf215546Sopenharmony_ci} 1574bf215546Sopenharmony_ci 1575bf215546Sopenharmony_cistatic bool schedule_block(gpir_block *block) 1576bf215546Sopenharmony_ci{ 1577bf215546Sopenharmony_ci /* calculate distance */ 1578bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &block->node_list, list) { 1579bf215546Sopenharmony_ci if (gpir_node_is_root(node)) 1580bf215546Sopenharmony_ci schedule_update_distance(node); 1581bf215546Sopenharmony_ci } 1582bf215546Sopenharmony_ci 1583bf215546Sopenharmony_ci sched_ctx ctx; 1584bf215546Sopenharmony_ci list_inithead(&ctx.ready_list); 1585bf215546Sopenharmony_ci ctx.block = block; 1586bf215546Sopenharmony_ci ctx.ready_list_slots = 0; 1587bf215546Sopenharmony_ci ctx.live_physregs = block->live_out_phys; 1588bf215546Sopenharmony_ci 1589bf215546Sopenharmony_ci for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) { 1590bf215546Sopenharmony_ci list_inithead(&ctx.physreg_reads[i]); 1591bf215546Sopenharmony_ci } 1592bf215546Sopenharmony_ci 1593bf215546Sopenharmony_ci /* construct the ready list from root nodes */ 1594bf215546Sopenharmony_ci list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { 1595bf215546Sopenharmony_ci /* Add to physreg_reads */ 1596bf215546Sopenharmony_ci if (node->op == gpir_op_load_reg) { 1597bf215546Sopenharmony_ci gpir_load_node *load = gpir_node_to_load(node); 1598bf215546Sopenharmony_ci unsigned index = 4 * load->index + load->component; 1599bf215546Sopenharmony_ci list_addtail(&load->reg_link, &ctx.physreg_reads[index]); 1600bf215546Sopenharmony_ci } 1601bf215546Sopenharmony_ci 1602bf215546Sopenharmony_ci if (gpir_node_is_root(node)) 1603bf215546Sopenharmony_ci schedule_insert_ready_list(&ctx, node); 1604bf215546Sopenharmony_ci } 1605bf215546Sopenharmony_ci 1606bf215546Sopenharmony_ci list_inithead(&block->node_list); 1607bf215546Sopenharmony_ci while (!list_is_empty(&ctx.ready_list)) { 1608bf215546Sopenharmony_ci if (!schedule_one_instr(&ctx)) 1609bf215546Sopenharmony_ci return false; 1610bf215546Sopenharmony_ci } 1611bf215546Sopenharmony_ci 1612bf215546Sopenharmony_ci return true; 1613bf215546Sopenharmony_ci} 1614bf215546Sopenharmony_ci 1615bf215546Sopenharmony_cistatic void schedule_build_dependency(gpir_block *block) 1616bf215546Sopenharmony_ci{ 1617bf215546Sopenharmony_ci /* merge dummy_f/m to the node created from */ 1618bf215546Sopenharmony_ci list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { 1619bf215546Sopenharmony_ci if (node->op == gpir_op_dummy_m) { 1620bf215546Sopenharmony_ci gpir_alu_node *alu = gpir_node_to_alu(node); 1621bf215546Sopenharmony_ci gpir_node *origin = alu->children[0]; 1622bf215546Sopenharmony_ci gpir_node *dummy_f = alu->children[1]; 1623bf215546Sopenharmony_ci 1624bf215546Sopenharmony_ci gpir_node_foreach_succ(node, dep) { 1625bf215546Sopenharmony_ci gpir_node *succ = dep->succ; 1626bf215546Sopenharmony_ci /* origin and node may have same succ (by VREG/INPUT or 1627bf215546Sopenharmony_ci * VREG/VREG dep), so use gpir_node_add_dep() instead of 1628bf215546Sopenharmony_ci * gpir_node_replace_pred() */ 1629bf215546Sopenharmony_ci gpir_node_add_dep(succ, origin, dep->type); 1630bf215546Sopenharmony_ci gpir_node_replace_child(succ, node, origin); 1631bf215546Sopenharmony_ci } 1632bf215546Sopenharmony_ci gpir_node_delete(dummy_f); 1633bf215546Sopenharmony_ci gpir_node_delete(node); 1634bf215546Sopenharmony_ci } 1635bf215546Sopenharmony_ci } 1636bf215546Sopenharmony_ci} 1637bf215546Sopenharmony_ci 1638bf215546Sopenharmony_cistatic void print_statistic(gpir_compiler *comp, int save_index) 1639bf215546Sopenharmony_ci{ 1640bf215546Sopenharmony_ci int num_nodes[gpir_op_num] = {0}; 1641bf215546Sopenharmony_ci int num_created_nodes[gpir_op_num] = {0}; 1642bf215546Sopenharmony_ci 1643bf215546Sopenharmony_ci list_for_each_entry(gpir_block, block, &comp->block_list, list) { 1644bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &block->node_list, list) { 1645bf215546Sopenharmony_ci num_nodes[node->op]++; 1646bf215546Sopenharmony_ci if (node->index >= save_index) 1647bf215546Sopenharmony_ci num_created_nodes[node->op]++; 1648bf215546Sopenharmony_ci } 1649bf215546Sopenharmony_ci } 1650bf215546Sopenharmony_ci 1651bf215546Sopenharmony_ci printf("====== gpir scheduler statistic ======\n"); 1652bf215546Sopenharmony_ci printf("---- how many nodes are scheduled ----\n"); 1653bf215546Sopenharmony_ci int n = 0, l = 0; 1654bf215546Sopenharmony_ci for (int i = 0; i < gpir_op_num; i++) { 1655bf215546Sopenharmony_ci if (num_nodes[i]) { 1656bf215546Sopenharmony_ci printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]); 1657bf215546Sopenharmony_ci n += num_nodes[i]; 1658bf215546Sopenharmony_ci if (!(++l % 4)) 1659bf215546Sopenharmony_ci printf("\n"); 1660bf215546Sopenharmony_ci } 1661bf215546Sopenharmony_ci } 1662bf215546Sopenharmony_ci if (l % 4) 1663bf215546Sopenharmony_ci printf("\n"); 1664bf215546Sopenharmony_ci printf("\ntotal: %d\n", n); 1665bf215546Sopenharmony_ci 1666bf215546Sopenharmony_ci printf("---- how many nodes are created ----\n"); 1667bf215546Sopenharmony_ci n = l = 0; 1668bf215546Sopenharmony_ci for (int i = 0; i < gpir_op_num; i++) { 1669bf215546Sopenharmony_ci if (num_created_nodes[i]) { 1670bf215546Sopenharmony_ci printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]); 1671bf215546Sopenharmony_ci n += num_created_nodes[i]; 1672bf215546Sopenharmony_ci if (!(++l % 4)) 1673bf215546Sopenharmony_ci printf("\n"); 1674bf215546Sopenharmony_ci } 1675bf215546Sopenharmony_ci } 1676bf215546Sopenharmony_ci if (l % 4) 1677bf215546Sopenharmony_ci printf("\n"); 1678bf215546Sopenharmony_ci printf("\ntotal: %d\n", n); 1679bf215546Sopenharmony_ci printf("------------------------------------\n"); 1680bf215546Sopenharmony_ci} 1681bf215546Sopenharmony_ci 1682bf215546Sopenharmony_cibool gpir_schedule_prog(gpir_compiler *comp) 1683bf215546Sopenharmony_ci{ 1684bf215546Sopenharmony_ci int save_index = comp->cur_index; 1685bf215546Sopenharmony_ci 1686bf215546Sopenharmony_ci /* init schedule info */ 1687bf215546Sopenharmony_ci int index = 0; 1688bf215546Sopenharmony_ci list_for_each_entry(gpir_block, block, &comp->block_list, list) { 1689bf215546Sopenharmony_ci block->sched.instr_index = 0; 1690bf215546Sopenharmony_ci list_for_each_entry(gpir_node, node, &block->node_list, list) { 1691bf215546Sopenharmony_ci node->sched.instr = NULL; 1692bf215546Sopenharmony_ci node->sched.pos = -1; 1693bf215546Sopenharmony_ci node->sched.index = index++; 1694bf215546Sopenharmony_ci node->sched.dist = -1; 1695bf215546Sopenharmony_ci /* TODO when we support multiple basic blocks, we need a way to keep 1696bf215546Sopenharmony_ci * track of this for physregs allocated before the scheduler. 1697bf215546Sopenharmony_ci */ 1698bf215546Sopenharmony_ci node->sched.physreg_store = NULL; 1699bf215546Sopenharmony_ci node->sched.ready = false; 1700bf215546Sopenharmony_ci node->sched.inserted = false; 1701bf215546Sopenharmony_ci node->sched.complex_allowed = false; 1702bf215546Sopenharmony_ci node->sched.max_node = false; 1703bf215546Sopenharmony_ci node->sched.next_max_node = false; 1704bf215546Sopenharmony_ci } 1705bf215546Sopenharmony_ci } 1706bf215546Sopenharmony_ci 1707bf215546Sopenharmony_ci /* build dependency */ 1708bf215546Sopenharmony_ci list_for_each_entry(gpir_block, block, &comp->block_list, list) { 1709bf215546Sopenharmony_ci schedule_build_dependency(block); 1710bf215546Sopenharmony_ci } 1711bf215546Sopenharmony_ci 1712bf215546Sopenharmony_ci //gpir_debug("after scheduler build reg dependency\n"); 1713bf215546Sopenharmony_ci //gpir_node_print_prog_dep(comp); 1714bf215546Sopenharmony_ci 1715bf215546Sopenharmony_ci list_for_each_entry(gpir_block, block, &comp->block_list, list) { 1716bf215546Sopenharmony_ci if (!schedule_block(block)) { 1717bf215546Sopenharmony_ci gpir_error("fail schedule block\n"); 1718bf215546Sopenharmony_ci return false; 1719bf215546Sopenharmony_ci } 1720bf215546Sopenharmony_ci } 1721bf215546Sopenharmony_ci 1722bf215546Sopenharmony_ci if (lima_debug & LIMA_DEBUG_GP) { 1723bf215546Sopenharmony_ci print_statistic(comp, save_index); 1724bf215546Sopenharmony_ci gpir_instr_print_prog(comp); 1725bf215546Sopenharmony_ci } 1726bf215546Sopenharmony_ci 1727bf215546Sopenharmony_ci return true; 1728bf215546Sopenharmony_ci} 1729