/*
 * Copyright (c) 2017 Lima Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include <limits.h>

#include "gpir.h"

/*
 * GP scheduling algorithm (by Connor Abbott <cwabbott0@gmail.com>)
 *
 * The GP pipeline has three main stages:
 *
 * --------------------------------------------------------
 * |                                                      |
 * |               Register/Attr/Temp Fetch               |
 * |                                                      |
 * --------------------------------------------------------
 * |        |        |        |        |        |         |
 * |  Mul0  |  Mul1  |  Add0  |  Add1  |  Cplx  |  Pass   |
 * |        |        |        |        |        |         |
 * --------------------------------------------------------
 * |             |                          |             |
 * |  Complex1   |  Temp/Register/Varying   |    Pass     |
 * |  Stage 2    |          Store           |   Stage 2   |
 * |             |                          |             |
 * --------------------------------------------------------
 *
 * Because of this setup, storing a register has a latency of three cycles.
 * Also, the register file is organized into 4-component vectors, and the
 * load stage can only load two vectors at a time. Aside from these highly
 * constrained register load/store units, there is an explicit bypass
 * network, where each unit (mul0/mul1/etc.) can access the results of any
 * unit from the previous two cycles directly, except for the complex unit
 * whose result can only be accessed for one cycle (since it's expected to
 * be used directly by the complex2 instruction in the following cycle).
 *
 * Because of the very restricted register file, and because only rarely are
 * all the units in use at the same time, it can be very beneficial to use
 * the unused units to "thread" a value from source to destination by using
 * moves in the otherwise-unused units, without involving the register file
 * at all. It's very difficult to fully exploit this with a traditional
 * scheduler, so we need to do something a little un-traditional. The 512
 * instruction limit means that for more complex shaders, we need to do as
 * well as possible or else the app won't even work.
 *
 * The scheduler works by considering the bypass network as a kind of
 * register file. It's a quite unusual register file, since registers have to
 * be assigned "on the fly" as we schedule operations, but with some care, we
 * can use something conceptually similar to a linear-scan allocator to
 * successfully schedule nodes to instructions without running into
 * conflicts.
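 *
 * To illustrate the kind of "threading" we want (this is only a sketch, not
 * output of the actual scheduler): a value computed in cycle N can be read
 * directly from the bypass network in cycles N+1 and N+2, so if its use is
 * further away we keep it alive with a move in an otherwise-unused unit
 * instead of going through the register file:
 *
 *    cycle N:    add0:  t  = a + b
 *    cycle N+2:  pass:  t' = mov t
 *    cycle N+4:  mul0:  ... = t' * c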
 *
 * Values in the IR are separated into normal values, or "value registers",
 * which is what normal nodes like add, mul, etc. produce, and which only
 * live inside one basic block, and registers, which can span multiple basic
 * blocks but have to be accessed via special load_reg/store_reg nodes. RA
 * assigns physical registers to both value registers and normal registers,
 * treating load_reg/store_reg as a move instruction, but these are only used
 * directly for normal registers -- the physreg assigned to a value register
 * is "fake," and is only used inside the scheduler. Before scheduling we
 * insert read-after-write dependencies, even for value registers, as if
 * we're going to use those, but then we throw them away. For example, if we
 * had something like:
 *
 * (*)r2 = add (*)r1, (*)r2
 * (*)r1 = load_reg r0
 *
 * we'd insert a write-after-read dependency between the add and load_reg,
 * even though the starred registers aren't actually used by the scheduler
 * after this step. This step is crucial since it guarantees that, at any
 * point in the schedule, the number of live registers + live value registers
 * will never exceed the capacity of the register file and the bypass network
 * combined. This is because each live register/value register will have a
 * different fake number, thanks to the fake dependencies inserted before
 * scheduling. This allows us to not have to worry about spilling to
 * temporaries, which is only done ahead of time.
 *
 * The scheduler is a bottom-up scheduler. It keeps track of each live value
 * register, and decides on-the-fly which value registers to keep in the
 * bypass network and which to "spill" to registers. Of particular importance
 * is the "ready list," which consists of "input nodes" (nodes that produce a
 * value that can be consumed via the bypass network), both "partially ready"
 * (only some of the uses have been scheduled) and "fully ready" (all uses
 * have been scheduled), as well as other non-input nodes like register
 * stores. Each input node on the ready list represents a live value register
 * before the current instruction. There must be at most 11 such input nodes
 * at all times, since there are only 11 slots in the next two instructions
 * which can reach the current instruction.
 *
 * An input node is a "max node" if it has a use two cycles ago, which must be
 * connected to a definition this cycle. Otherwise it may be a "next max node"
 * if it will be a max node on the next instruction (i.e. it has a use at most
 * one cycle ago), or it may be neither if all of its uses are this cycle. As
 * we keep adding instructions to the front, input nodes graduate from
 * neither, to next max, to max, unless we decide to insert a move to keep
 * them alive longer, at which point any uses after the current instruction
 * are rewritten to be uses of the move so that the original node returns to
 * neither. The scheduler decides which nodes to try freely, but we have to
 * reserve slots for two different reasons: (1) Out of the 5 non-complex
 * slots, we reserve a slot for each max node, so that we can connect a
 * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a
 * slot for every next-max node above 5, so that for the next instruction
 * there are no more than 5 max nodes. When a max or next-max node gets
 * scheduled, the corresponding reservation is reduced by one.
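 * For example (numbers purely illustrative): with 3 max nodes and 7 next-max
 * nodes, we would reserve 3 of the 5 non-complex slots for the max nodes
 * plus 7 - 5 = 2 slots for the excess next-max nodes.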
 * At the end, we insert moves for every slot that was reserved. The
 * reservation is actually managed by gpir_instr, and all we have to do is
 * tell it how many to reserve at the beginning and then tell it which nodes
 * are max/next-max nodes. When we start scheduling an instruction, there
 * will be at most 5 max nodes thanks to the previous instruction's next-max
 * reservation/move insertion. Since there are at most 11 total input nodes,
 * if there are N max nodes, there are at most 11 - N next-max nodes, and
 * therefore at most 11 - N - 5 = 6 - N slots need to be reserved for
 * next-max nodes, and so at most 6 - N + N = 6 slots need to be reserved in
 * total, exactly the total number of slots. So, thanks to the total input
 * node restriction, we will never need to reserve too many slots.
 *
 * It sometimes happens that scheduling a given node will violate this total
 * input node restriction, or that a reservation will mean that we can't
 * schedule it. We first schedule a node "speculatively" to see if this is a
 * problem. If some of the node's sources are loads, then we can schedule
 * the node and its dependent loads in one swoop to avoid going over the
 * pressure limit. If that fails, we can try to spill a ready or
 * partially-ready input node to a register by rewriting all of its uses to
 * refer to a register load. This removes it from the list of ready and
 * partially ready input nodes, as all of its uses are now unscheduled. If
 * successful, we can then proceed with scheduling the original node. All of
 * this happens "speculatively," meaning that afterwards the node is removed
 * and the entire state of the scheduler is reverted to before it was tried,
 * to ensure that we never get into an invalid state and run out of spots for
 * moves. In try_node(), we try to schedule each node on the ready list
 * speculatively, keeping only the nodes that could be successfully
 * scheduled, so that when we finally decide which node to actually schedule,
 * we know it will succeed. This is how we decide on the fly which values go
 * in registers and which go in the bypass network. Note that "unspilling" a
 * value is simply a matter of scheduling the store_reg instruction created
 * when we spill.
 *
 * The careful accounting of live value registers, reservations for moves,
 * and speculative scheduling guarantee that we never run into a failure case
 * while scheduling. However, we need to make sure that this scheduler will
 * not get stuck in an infinite loop, i.e. that we'll always make forward
 * progress by eventually scheduling a non-move node. If we run out of value
 * registers, then we may have to spill a node to a register. Say that
 * scheduling one of the fully-ready nodes would leave us with 11 + N live
 * value registers before the current instruction, i.e. N too many. Since
 * there are at most 64 + 11 live physical registers and value registers in
 * total, thanks to the fake dependencies we inserted before scheduling,
 * there are at most 64 - N live physical registers, and therefore there are
 * at least N registers available for spilling. Not all these registers will
 * be available immediately, since in order to spill a node to a given
 * register we have to ensure that there are slots available to rewrite every
 * use to a load instruction, and that may not be the case.
 * There may also be intervening writes which prevent some registers from
 * being used. However, these are all temporary problems, since as we create
 * each instruction, we create additional register load slots that can be
 * freely used for spilling, and we create more move nodes which means that
 * the uses of the nodes we're trying to spill keep moving forward. This
 * means that eventually, these problems will go away, at which point we'll
 * be able to spill a node successfully, so eventually we'll be able to
 * schedule the first node on the ready list.
 */

typedef struct {
   /* This is the list of ready and partially-ready nodes. A partially-ready
    * node must have at least one input dependency already scheduled.
    */
   struct list_head ready_list;

   /* The number of ready or partially-ready nodes with at least one input
    * dependency already scheduled. In other words, the number of live value
    * registers. This must be at most 11.
    */
   int ready_list_slots;

   /* The physical registers live into the current instruction. */
   uint64_t live_physregs;

   /* The current instruction. */
   gpir_instr *instr;

   /* The current basic block. */
   gpir_block *block;

   /* True if at least one node failed to schedule due to lack of available
    * value registers.
    */
   bool try_spill_all;

   /* The number of max nodes that need to be spilled to successfully
    * schedule the instruction.
    */
   int max_node_spill_needed;

   /* The number of max and next-max nodes that need to be spilled to
    * successfully schedule the instruction.
    */
   int total_spill_needed;

   /* For each physical register, a linked list of loads associated with it
    * in this block. When we spill a value to a given register, and there are
    * existing loads associated with it that haven't been scheduled yet, we
    * have to make sure that the corresponding unspill happens after the last
    * original use has happened, i.e. is scheduled before it (program order
    * and scheduling order are reversed, since we schedule bottom-up).
    */
   struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM];
} sched_ctx;

static int gpir_min_dist_alu(gpir_dep *dep)
{
   switch (dep->pred->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
   case gpir_op_load_reg:
   case gpir_op_load_attribute:
      return 0;

   case gpir_op_complex1:
      return 2;

   default:
      return 1;
   }
}

static int gpir_get_min_dist(gpir_dep *dep)
{
   switch (dep->type) {
   case GPIR_DEP_INPUT:
      switch (dep->succ->op) {
      case gpir_op_store_temp:
      case gpir_op_store_reg:
      case gpir_op_store_varying:
         /* Stores must use an alu node as input. Also, complex1 takes two
          * cycles, which means that its result cannot be stored to a
          * register as part of the normal path, and therefore it must also
          * have a move inserted.
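          * As a sketch (not actual IR syntax), a chain like
          *
          *    store_varying(complex1(a, b))
          *
          * has to become store_varying(mov(complex1(a, b))) by the time it
          * is scheduled.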
          */
         if (dep->pred->type == gpir_node_type_load ||
             dep->pred->op == gpir_op_complex1)
            return INT_MAX >> 2;
         else
            return 0;

      default:
         return gpir_min_dist_alu(dep);
      }

   case GPIR_DEP_OFFSET:
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_min_dist_alu(dep);

   case GPIR_DEP_READ_AFTER_WRITE:
      if (dep->succ->op == gpir_op_load_temp &&
          dep->pred->op == gpir_op_store_temp) {
         return 4;
      } else if (dep->succ->op == gpir_op_load_reg &&
                 dep->pred->op == gpir_op_store_reg) {
         return 3;
      } else if ((dep->pred->op == gpir_op_store_temp_load_off0 ||
                  dep->pred->op == gpir_op_store_temp_load_off1 ||
                  dep->pred->op == gpir_op_store_temp_load_off2) &&
                 dep->succ->op == gpir_op_load_uniform) {
         return 4;
      } else {
         /* Fake dependency */
         return 0;
      }

   case GPIR_DEP_WRITE_AFTER_READ:
      return 0;
   }

   return 0;
}

static int gpir_max_dist_alu(gpir_dep *dep)
{
   switch (dep->pred->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
      return 0;
   case gpir_op_load_attribute:
      return 1;
   case gpir_op_load_reg:
      if (dep->pred->sched.pos < GPIR_INSTR_SLOT_REG0_LOAD0 ||
          dep->pred->sched.pos > GPIR_INSTR_SLOT_REG0_LOAD3)
         return 0;
      else
         return 1;
   case gpir_op_exp2_impl:
   case gpir_op_log2_impl:
   case gpir_op_rcp_impl:
   case gpir_op_rsqrt_impl:
   case gpir_op_store_temp_load_off0:
   case gpir_op_store_temp_load_off1:
   case gpir_op_store_temp_load_off2:
      return 1;
   case gpir_op_mov:
      if (dep->pred->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
         return 1;
      else
         return 2;
   default:
      return 2;
   }
}

static int gpir_get_max_dist(gpir_dep *dep)
{
   switch (dep->type) {
   case GPIR_DEP_INPUT:
      switch (dep->succ->op) {
      case gpir_op_store_temp:
      case gpir_op_store_reg:
      case gpir_op_store_varying:
         return 0;

      default:
         return gpir_max_dist_alu(dep);
      }

   case GPIR_DEP_OFFSET:
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_max_dist_alu(dep);

   default:
      return INT_MAX >> 2; /* Don't want to overflow... */
   }
}

static void schedule_update_distance(gpir_node *node)
{
   if (gpir_node_is_leaf(node)) {
      node->sched.dist = 0;
      return;
   }

   gpir_node_foreach_pred(node, dep) {
      gpir_node *pred = dep->pred;

      if (pred->sched.dist < 0)
         schedule_update_distance(pred);

      int dist = pred->sched.dist + gpir_min_dist_alu(dep);
      if (node->sched.dist < dist)
         node->sched.dist = dist;
   }
}

static bool gpir_is_input_node(gpir_node *node)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type == GPIR_DEP_INPUT)
         return true;
   }
   return false;
}

/* Get the number of slots required for a node on the ready list.
 */
static int gpir_get_slots_required(gpir_node *node)
{
   if (!gpir_is_input_node(node))
      return 0;

   /* Note that we assume every node only consumes one slot, even dual-slot
    * instructions. While dual-slot instructions may consume more than one
    * slot, we can always safely insert a move if it turns out that there
    * isn't enough space for them. There's the risk that we get stuck in an
    * infinite loop if all the fully ready nodes are dual-slot nodes, but we
    * rely on spilling to registers to save us here.
    */
   return 1;
}

static void ASSERTED verify_ready_list(sched_ctx *ctx)
{
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node)) {
         assert(node->sched.ready);
      }

      if (node->sched.ready) {
         /* Every successor must have been scheduled */
         gpir_node_foreach_succ(node, dep) {
            assert(dep->succ->sched.instr);
         }
      } else {
         /* There must be at least one successor that's not scheduled. */
         bool unscheduled = false;
         gpir_node_foreach_succ(node, dep) {
            unscheduled |= !(dep->succ->sched.instr);
         }

         assert(unscheduled);
      }
   }
}

static void schedule_insert_ready_list(sched_ctx *ctx,
                                       gpir_node *insert_node)
{
   /* A node needs to be inserted into the ready list if it is either fully
    * ready (all of its successors have been scheduled) or partially ready
    * (some of its input successors have been scheduled), but we only
    * schedule a move node for a partially ready node.
    */
   bool ready = true, insert = false;
   gpir_node_foreach_succ(insert_node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr) {
         if (dep->type == GPIR_DEP_INPUT)
            insert = true;
      }
      else
         ready = false;
   }

   insert_node->sched.ready = ready;
   /* for root node */
   insert |= ready;

   if (!insert || insert_node->sched.inserted)
      return;

   struct list_head *insert_pos = &ctx->ready_list;
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if ((insert_node->sched.dist > node->sched.dist ||
           gpir_op_infos[insert_node->op].schedule_first) &&
          !gpir_op_infos[node->op].schedule_first) {
         insert_pos = &node->list;
         break;
      }
   }

   list_addtail(&insert_node->list, insert_pos);
   insert_node->sched.inserted = true;
   ctx->ready_list_slots += gpir_get_slots_required(insert_node);
}

static int gpir_get_max_start(gpir_node *node)
{
   int max_start = 0;

   /* find the max start instr constrained by all successors */
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr)
         continue;

      int start = succ->sched.instr->index + gpir_get_min_dist(dep);
      if (start > max_start)
         max_start = start;
   }

   return max_start;
}

static int gpir_get_min_end(gpir_node *node)
{
   int min_end = INT_MAX;

   /* find the min end instr constrained by all successors */
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr)
         continue;

      int end = succ->sched.instr->index + gpir_get_max_dist(dep);
      if (end < min_end)
         min_end = end;
   }

   return min_end;
}

static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
{
   gpir_load_node *load = gpir_node_to_load(node);

   for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) {
      if (!instr->slots[i])
         continue;

      gpir_load_node *iload = gpir_node_to_load(instr->slots[i]);
      if (load->node.op == iload->node.op &&
          load->index == iload->index &&
          load->component == iload->component)
         return &iload->node;
   }
   return NULL;
}

/* Simply place the node into the given instruction without trying to deal
 * with liveness or the ready list. This will only fail if the node cannot be
 * placed due to a lack of available slots.
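 * When placement fails only because of slot reservations, we also record in
 * the context how many max/next-max nodes would have to be spilled for the
 * node to fit (see max_node_spill_needed/total_spill_needed).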
 * In addition to normal node placement, this is also used for placing loads
 * when spilling to registers.
 */
static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
{
   if (node->type == gpir_node_type_load) {
      gpir_node *load = gpir_sched_instr_has_load(instr, node);
      if (load) {
         /* This node may have a store as a successor, in which case we have
          * to fail it exactly like below in order to later create a move
          * node in between.
          */
         if (instr->index < gpir_get_max_start(node))
            return false;

         gpir_debug("same load %d in instr %d for node %d\n",
                    load->index, instr->index, node->index);

         /* We don't really merge the two nodes, just pretend that this one
          * was scheduled in the same place as the existing load.
          */
         node->sched.instr = load->sched.instr;
         node->sched.pos = load->sched.pos;
         return true;
      }
   }

   if (node->op == gpir_op_store_reg) {
      /* This register may be loaded in the next basic block, in which case
       * there still needs to be a 2 instruction gap. We do what the blob
       * seems to do and simply disable stores in the last two instructions
       * of the basic block.
       *
       * TODO: We may be able to do better than this, but we have to check
       * first if storing a register works across branches.
       */
      if (instr->index < 2)
         return false;
   }

   node->sched.instr = instr;

   int max_node_spill_needed = INT_MAX;
   int total_spill_needed = INT_MAX;
   int *slots = gpir_op_infos[node->op].slots;
   for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) {
      node->sched.pos = slots[i];
      if (instr->index >= gpir_get_max_start(node) &&
          instr->index <= gpir_get_min_end(node) &&
          gpir_instr_try_insert_node(instr, node))
         return true;
      if (ctx->instr->non_cplx_slot_difference ||
          ctx->instr->slot_difference) {
         /* If one of these fields is non-zero, then we could insert the node
          * here after spilling. To get an accurate count of how many nodes
          * we need to spill, we need to choose one of the positions where
          * there were nonzero slot differences, preferably one with the
          * smallest difference (so we don't have to spill as much).
          */
         if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed ||
             ctx->instr->slot_difference < total_spill_needed) {
            max_node_spill_needed = ctx->instr->non_cplx_slot_difference;
            total_spill_needed = ctx->instr->slot_difference;
         }
      }
   }

   if (max_node_spill_needed != INT_MAX) {
      /* Indicate how many spill nodes are needed. */
      ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed,
                                        max_node_spill_needed);
      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
                                     total_spill_needed);
   }
   node->sched.instr = NULL;
   node->sched.pos = -1;
   return false;
}

/* Try to place just the node given, updating the ready list. If "speculative"
 * is true, then this is part of the pre-commit phase. If false, then we have
 * committed to placing this node, so update liveness and ready list
 * information.
 */

static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node,
                                    bool speculative)
{
   if (!_try_place_node(ctx, ctx->instr, node)) {
      if (!speculative)
         gpir_debug("failed to place %d\n", node->index);
      return false;
   }

   ctx->ready_list_slots -= gpir_get_slots_required(node);

   if (!speculative) {
      gpir_debug("placed node %d\n", node->index);

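      /* Update physreg liveness. Since we schedule bottom-up, placing the
       * store_reg (the write) ends the register's live range for the
       * instructions above this one, while placing a load_reg (a read)
       * makes the register live again.
       */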
      /* We assume here that writes are placed before reads. If this changes,
       * then this needs to be updated.
       */
      if (node->op == gpir_op_store_reg) {
         gpir_store_node *store = gpir_node_to_store(node);
         ctx->live_physregs &=
            ~(1ull << (4 * store->index + store->component));
         if (store->child->sched.physreg_store == store)
            store->child->sched.physreg_store = NULL;
      }

      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         ctx->live_physregs |=
            (1ull << (4 * load->index + load->component));
      }

      list_del(&node->list);
      list_add(&node->list, &ctx->block->node_list);
      gpir_node_foreach_pred(node, dep) {
         gpir_node *pred = dep->pred;
         schedule_insert_ready_list(ctx, pred);
      }
   } else {
      gpir_node_foreach_pred(node, dep) {
         gpir_node *pred = dep->pred;
         if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT)
            ctx->ready_list_slots += gpir_get_slots_required(pred);
      }
   }

   return true;
}

/* Create a new node with "node" as the child, replace all uses of "node" with
 * this new node, and replace "node" with it in the ready list.
 */
static gpir_node *create_replacement(sched_ctx *ctx, gpir_node *node,
                                     gpir_op op)
{
   gpir_alu_node *new_node = gpir_node_create(node->block, op);
   if (unlikely(!new_node))
      return NULL;

   new_node->children[0] = node;
   new_node->num_child = 1;

   new_node->node.sched.instr = NULL;
   new_node->node.sched.pos = -1;
   new_node->node.sched.dist = node->sched.dist;
   new_node->node.sched.max_node = node->sched.max_node;
   new_node->node.sched.next_max_node = node->sched.next_max_node;
   new_node->node.sched.complex_allowed = node->sched.complex_allowed;

   ctx->ready_list_slots--;
   list_del(&node->list);
   node->sched.max_node = false;
   node->sched.next_max_node = false;
   node->sched.ready = false;
   node->sched.inserted = false;
   gpir_node_replace_succ(&new_node->node, node);
   gpir_node_add_dep(&new_node->node, node, GPIR_DEP_INPUT);
   schedule_insert_ready_list(ctx, &new_node->node);
   return &new_node->node;
}

static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
{
   gpir_node *move = create_replacement(ctx, node, gpir_op_mov);
   gpir_debug("create move %d for %d\n", move->index, node->index);
   return move;
}

static gpir_node *create_postlog2(sched_ctx *ctx, gpir_node *node)
{
   assert(node->op == gpir_op_complex1);
   gpir_node *postlog2 = create_replacement(ctx, node, gpir_op_postlog2);
   gpir_debug("create postlog2 %d for %d\n", postlog2->index, node->index);
   return postlog2;
}

/* Once we schedule the successor, would the predecessor be fully ready? */
static bool pred_almost_ready(gpir_dep *dep)
{
   bool fully_ready = true;
   gpir_node_foreach_succ(dep->pred, other_dep) {
      gpir_node *succ = other_dep->succ;
      if (!succ->sched.instr && dep->succ != other_dep->succ) {
         fully_ready = false;
         break;
      }
   }

   return fully_ready;
}

/* Recursively try to schedule a node and all its dependent nodes that can fit
 * in the same instruction. There is a simple heuristic scoring system to try
 * to group together nodes that load different components of the same input,
 * to avoid bottlenecking for operations like matrix multiplies that are
 * mostly input-bound.
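 * The score itself is only a rough heuristic: each dependent load we manage
 * to place in the same instruction adds its (recursively computed) score,
 * and each one we fail to place subtracts one.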
 */

static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
{
   if (!schedule_try_place_node(ctx, node, speculative))
      return INT_MIN;

   int score = 0;

   gpir_node_foreach_pred(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      int pred_score = INT_MIN;
      if (pred_almost_ready(dep)) {
         if (dep->pred->type == gpir_node_type_load ||
             node->type == gpir_node_type_store) {
            pred_score = _schedule_try_node(ctx, dep->pred, speculative);
         }
      }
      if (dep->pred->type == gpir_node_type_load ||
          node->type == gpir_node_type_store) {
         if (pred_score == INT_MIN) {
            if (node->op == gpir_op_mov) {
               /* The only moves on the ready list are for loads that we
                * couldn't schedule immediately, as created below. If we
                * couldn't schedule the load, there's no point scheduling
                * the move. The normal move threading logic will ensure
                * that another move is created if we're about to go too far
                * from the uses of this move.
                */
               assert(speculative);
               return INT_MIN;
            } else if (!speculative && dep->pred->type == gpir_node_type_load) {
               /* We couldn't schedule the load right away, so it will have
                * to happen in some earlier instruction and then be moved
                * into a value register and threaded to the use by "node".
                * We create the move right away, so that later we'll fail
                * to schedule it if there isn't a slot for a move
                * available.
                */
               create_move(ctx, dep->pred);
            }
            /* Penalize nodes whose dependent ops we couldn't schedule.
             */
            score--;
         } else {
            score += pred_score;
            continue;
         }
      }
   }

   return score;
}

/* If we speculatively tried a node, undo everything.
 */

static void schedule_undo_node(sched_ctx *ctx, gpir_node *node)
{
   gpir_instr_remove_node(ctx->instr, node);

   gpir_node_foreach_pred(node, dep) {
      gpir_node *pred = dep->pred;
      if (pred->sched.instr) {
         schedule_undo_node(ctx, pred);
      }
   }
}

/* Try to schedule a node. We also try to schedule any predecessors that can
 * be part of the same instruction. If "speculative" is true, then we don't
 * actually change any state, only returning the score were the node to be
 * scheduled, with INT_MIN meaning "cannot be scheduled at all".
 */
static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
{
   int prev_slots = ctx->ready_list_slots;

   int score = _schedule_try_node(ctx, node, speculative);

   if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) {
      assert(speculative);
      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
                                     ctx->ready_list_slots - GPIR_VALUE_REG_NUM);
      score = INT_MIN;
   }

   if (speculative) {
      ctx->ready_list_slots = prev_slots;
      if (node->sched.instr)
         schedule_undo_node(ctx, node);
   }

   return score;
}

/* This is called when we want to spill "node" by inserting loads at its uses.
 * It returns all the possible registers we can use so that all the loads will
 * successfully be inserted. Also return the first instruction we'll need to
 * insert a load for.
 */

static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node,
                                   int *min_index)
{
   uint64_t available = ~0ull;
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *use = dep->succ;
      gpir_instr *instr = use->sched.instr;

      if (!instr) {
         /* This use isn't scheduled, so no need to spill it. */
         continue;
      }

      if (use->type == gpir_node_type_store) {
         /* We're trying to spill something that was recently stored... just
          * bail out.
          */
         return 0;
      }

      if (use->op == gpir_op_mov && instr == ctx->instr) {
         /* We try to spill the sources of this move, so we can free up space
          * in the current instruction.
          *
          * TODO: should we go back further? It might let us schedule the
          * write earlier in some cases, but then we might fail to spill.
          */
         available &= get_available_regs(ctx, use, min_index);
      } else {
         if (instr->index < *min_index)
            *min_index = instr->index;

         uint64_t use_available = 0;

         if (instr->reg0_use_count == 0)
            use_available = ~0ull;
         else if (!instr->reg0_is_attr)
            use_available = 0xfull << (4 * instr->reg0_index);

         if (instr->reg1_use_count == 0)
            use_available = ~0ull;
         else
            use_available |= 0xfull << (4 * instr->reg1_index);

         available &= use_available;
      }
   }

   return available;
}

/* Using "min_index" returned by get_available_regs(), figure out which
 * registers are killed by a write after or during the current instruction
 * and hence we can't use for spilling. Writes that haven't been scheduled
 * yet should be reflected in live_physregs.
 */

static uint64_t get_killed_regs(sched_ctx *ctx, int min_index)
{
   uint64_t killed = 0;

   list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) {
      if (instr->index <= min_index)
         break;

      for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3;
           slot++) {
         if (!instr->slots[slot])
            continue;

         gpir_store_node *store = gpir_node_to_store(instr->slots[slot]);
         if (store->node.op != gpir_op_store_reg)
            continue;

         killed |= 1ull << (4 * store->index + store->component);
      }
   }

   return killed;
}

/* Actually spill a node so that it is no longer in the ready list. Note that
 * this must exactly follow the logic of get_available_regs() or else the
 * loads could fail to schedule.
 */

static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
{
   gpir_node_foreach_succ_safe(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *use = dep->succ;
      gpir_instr *instr = use->sched.instr;

      if (!instr)
         continue;

      if (use->op == gpir_op_mov && instr == ctx->instr) {
         spill_node(ctx, use, store);
      } else {
         gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg);
         load->index = store->index;
         load->component = store->component;
         list_add(&load->node.list, &ctx->block->node_list);
         gpir_node_replace_child(dep->succ, dep->pred, &load->node);
         gpir_node_replace_pred(dep, &load->node);
         gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
         gpir_debug("spilling use %d of node %d to load node %d\n",
                    use->index, node->index, load->node.index);
         ASSERTED bool result = _try_place_node(ctx, use->sched.instr, &load->node);
         assert(result);
      }
   }

   if (node->op == gpir_op_mov) {
      /* We replaced all the uses of the move, so it's dead now. */
      gpir_instr_remove_node(node->sched.instr, node);
      gpir_node_delete(node);
   } else {
      /* We deleted all the uses of the node except the store, so it's not
       * live anymore.
       */
      list_del(&node->list);
      node->sched.inserted = false;
      ctx->ready_list_slots--;
      if (node->sched.max_node) {
         node->sched.max_node = false;
         ctx->instr->alu_num_slot_needed_by_max--;
      }
      if (node->sched.next_max_node) {
         node->sched.next_max_node = false;
         ctx->instr->alu_num_unscheduled_next_max--;
      }
   }
}

static bool used_by_store(gpir_node *node, gpir_instr *instr)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      if (dep->succ->type == gpir_node_type_store &&
          dep->succ->sched.instr == instr)
         return true;
   }

   return false;
}

static gpir_node *consuming_postlog2(gpir_node *node)
{
   if (node->op != gpir_op_complex1)
      return NULL;

   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;
      if (dep->succ->op == gpir_op_postlog2)
         return dep->succ;
      else
         return NULL;
   }

   return NULL;
}

static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
{
   assert(node->op != gpir_op_mov);

   if (used_by_store(node, ctx->instr))
      return false;

   gpir_debug("trying to spill %d\n", node->index);

   int min_instr = INT_MAX;
   uint64_t available = get_available_regs(ctx, node, &min_instr);
   available &= ~get_killed_regs(ctx, min_instr);

   if (node->sched.physreg_store) {
      gpir_store_node *store = node->sched.physreg_store;
      if (!(available & (1ull << (4 * store->index + store->component))))
         return false;
   } else {
      available &= ~ctx->live_physregs;

      if (available == 0)
         return false;

      /* Don't spill complex1 directly if it's consumed by a postlog2: turn
       * the postlog2 into a move, replace the complex1 with a new postlog2,
       * and spill that instead. The store needs a move anyway, so the
       * postlog2 is usually free.
       */
      gpir_node *postlog2 = consuming_postlog2(node);
      if (postlog2) {
         postlog2->op = gpir_op_mov;
         node = create_postlog2(ctx, node);
      }

      /* TODO: use a better heuristic for choosing an available register?
       */
      int physreg = ffsll(available) - 1;

      ctx->live_physregs |= (1ull << physreg);

      gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
      store->index = physreg / 4;
      store->component = physreg % 4;
      store->child = node;
      store->node.sched.max_node = false;
      store->node.sched.next_max_node = false;
      store->node.sched.complex_allowed = false;
      store->node.sched.pos = -1;
      store->node.sched.instr = NULL;
      store->node.sched.inserted = false;
      store->node.sched.dist = node->sched.dist;
      if (node->op == gpir_op_complex1) {
         /* Complex1 cannot be directly stored, and has a latency of 2 */
         store->node.sched.dist += 2;
      }
      node->sched.physreg_store = store;
      gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);

      list_for_each_entry(gpir_load_node, load,
                          &ctx->physreg_reads[physreg], reg_link) {
         gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
         if (load->node.sched.ready) {
            list_del(&load->node.list);
            load->node.sched.ready = false;
         }
      }

      node->sched.ready = false;
      schedule_insert_ready_list(ctx, &store->node);
   }

   gpir_debug("spilling %d to $%d.%c, store %d\n", node->index,
              node->sched.physreg_store->index,
              "xyzw"[node->sched.physreg_store->component],
              node->sched.physreg_store->node.index);

   spill_node(ctx, node, node->sched.physreg_store);

   return true;
}

static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node)
{
   /* First, try to spill max nodes. */
   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
      if (ctx->max_node_spill_needed <= 0)
         break;

      /* orig_node is the node we're trying to schedule, so spilling it makes
       * no sense. Also don't try to spill any nodes in front of it, since
       * they might be scheduled instead.
       */
      if (node == orig_node)
         break;

      if (node->op == gpir_op_mov) {
         /* Don't try to spill loads, since that only adds another load and
          * store which is likely pointless.
          */
         continue;
      }

      if (!gpir_is_input_node(node) || !node->sched.max_node)
         continue;

      if (try_spill_node(ctx, node)) {
         ctx->max_node_spill_needed--;
         ctx->total_spill_needed--;
      }
   }

   /* Now, try to spill the remaining nodes. */
   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
      if (ctx->total_spill_needed <= 0)
         break;

      if (node == orig_node)
         break;

      if (node->op == gpir_op_mov)
         continue;

      if (!gpir_is_input_node(node) ||
          !(node->sched.max_node || node->sched.next_max_node))
         continue;

      if (try_spill_node(ctx, node))
         ctx->total_spill_needed--;
   }

   return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
}

static int ASSERTED gpir_get_curr_ready_list_slots(sched_ctx *ctx)
{
   int total = 0;
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      total += gpir_get_slots_required(node);
   }

   return total;
}

/* What gpir_get_min_end() would return if node were replaced with a move
 * instruction not in the complex slot. Normally this is 2 + min_end, except
 * for some store instructions which must have the move node in the same
 * instruction.
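 * Store successors are therefore skipped here; the slot they need is
 * accounted for separately via alu_num_slot_needed_by_store.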
 */
static int gpir_get_min_end_as_move(gpir_node *node)
{
   int min = INT_MAX;
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
         switch (succ->op) {
         case gpir_op_store_temp:
         case gpir_op_store_reg:
         case gpir_op_store_varying:
            continue;
         default:
            break;
         }
         if (min > succ->sched.instr->index + 2)
            min = succ->sched.instr->index + 2;
      }
   }
   return min;
}

/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
 * The hardware overwrites the add second sources with 0 and mul second
 * sources with 1. This can be a problem if we need to insert more next-max
 * moves but we only have values that can't use the complex unit for moves.
 *
 * Fortunately, we only need to insert a next-max move if there are more than
 * 5 next-max nodes, but there are only 4 sources in the previous instruction
 * that make values not complex-capable, which means there can be at most 4
 * non-complex-capable values. Hence there will always be at least two values
 * that can be rewritten to use a move in the complex slot. However, we have
 * to be careful not to waste those values by putting both of them in a
 * non-complex slot. This is handled for us by gpir_instr, which will reject
 * such instructions. We just need to tell it which nodes can use complex, and
 * it will do the accounting to figure out what is safe.
 */

static bool can_use_complex(gpir_node *node)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *succ = dep->succ;
      if (succ->type != gpir_node_type_alu ||
          !succ->sched.instr)
         continue;

      /* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1}
       */
      gpir_alu_node *alu = gpir_node_to_alu(succ);
      switch (alu->node.op) {
      case gpir_op_complex1:
         /* complex1 puts its third source in the fourth slot */
         if (alu->children[1] == node || alu->children[2] == node)
            return false;
         break;
      case gpir_op_complex2:
         /* complex2 has its source duplicated, since it actually takes two
          * sources but we only ever use it with both sources the same. Hence
          * its source can never be the complex slot.
          */
         return false;
      case gpir_op_select:
         /* Select has its sources rearranged */
         if (alu->children[0] == node)
            return false;
         break;
      default:
         assert(alu->num_child <= 2);
         if (alu->num_child == 2 && alu->children[1] == node)
            return false;
         break;
      }
   }

   return true;
}

/* Initialize node->sched.max_node and node->sched.next_max_node for every
 * input node on the ready list. We should only need to do this once per
 * instruction, at the beginning, since we never add max nodes to the ready
 * list.
 */

static void sched_find_max_nodes(sched_ctx *ctx)
{
   ctx->instr->alu_num_unscheduled_next_max = 0;
   ctx->instr->alu_num_slot_needed_by_max = 0;

   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node))
         continue;

      int min_end_move = gpir_get_min_end_as_move(node);
      node->sched.max_node = (min_end_move == ctx->instr->index);
      node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
      if (node->sched.next_max_node)
         node->sched.complex_allowed = can_use_complex(node);

      if (node->sched.max_node)
         ctx->instr->alu_num_slot_needed_by_max++;
      if (node->sched.next_max_node)
         ctx->instr->alu_num_unscheduled_next_max++;
   }
}

/* Verify the invariants described in gpir.h, as well as making sure the
 * counts are correct.
 */
static void ASSERTED verify_max_nodes(sched_ctx *ctx)
{
   int alu_num_slot_needed_by_max = 0;
   int alu_num_unscheduled_next_max = 0;
   int alu_num_slot_needed_by_store = 0;
   int alu_num_slot_needed_by_non_cplx_store = 0;
   ASSERTED int alu_max_allowed_next_max = 5;

   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node))
         continue;

      if (node->sched.max_node)
         alu_num_slot_needed_by_max++;
      if (node->sched.next_max_node)
         alu_num_unscheduled_next_max++;
      if (used_by_store(node, ctx->instr)) {
         alu_num_slot_needed_by_store++;
         if (node->sched.next_max_node && !node->sched.complex_allowed)
            alu_num_slot_needed_by_non_cplx_store++;
      }
   }

   if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] &&
       ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1)
      alu_max_allowed_next_max = 4;

   assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
   assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max);
   assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max);
   assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
   assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
          alu_num_slot_needed_by_non_cplx_store);
   assert(ctx->instr->alu_num_slot_free >=
          alu_num_slot_needed_by_store + alu_num_slot_needed_by_max +
          MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0));
   assert(ctx->instr->alu_non_cplx_slot_free >=
          alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
}

static bool try_node(sched_ctx *ctx)
{
   gpir_node *best_node = NULL;
   int best_score = INT_MIN;

   /* Spilling will delete arbitrary nodes after the current one in the ready
    * list, which means that we always need to look up the next node in the
    * list at the end of each iteration. While list_for_each_entry() works for
    * this purpose, its sanity checking assumes that you don't want to modify
    * the list at all. We know better here, so we have to open-code
    * list_for_each_entry() without the check in order to not assert.
    */
   for (gpir_node *node = list_entry(ctx->ready_list.next, gpir_node, list);
        &node->list != &ctx->ready_list;
        node = list_entry(node->list.next, gpir_node, list)) {
      if (best_score != INT_MIN) {
         if (node->sched.dist < best_node->sched.dist)
            break;
      }

      if (node->sched.ready) {
         ctx->total_spill_needed = 0;
         ctx->max_node_spill_needed = 0;
         int score = schedule_try_node(ctx, node, true);
         if (score == INT_MIN && !best_node &&
             ctx->total_spill_needed > 0 &&
             try_spill_nodes(ctx, node)) {
            score = schedule_try_node(ctx, node, true);
         }

         /* schedule_first nodes must be scheduled if possible */
         if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) {
            best_node = node;
            best_score = score;
            break;
         }

         if (score > best_score) {
            best_score = score;
            best_node = node;
         }
      }
   }

   if (best_node) {
      gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
                 best_score, best_node->sched.max_node ? " (max)" : "");
      ASSERTED int score = schedule_try_node(ctx, best_node, false);
      assert(score != INT_MIN);
      return true;
   }

   return false;
}

static void place_move(sched_ctx *ctx, gpir_node *node)
{
   /* For complex1 that is consumed by a postlog2, we cannot allow any moves
    * in between. Convert the postlog2 to a move and insert a new postlog2,
    * and try to schedule it again in try_node().
    */
   gpir_node *postlog2 = consuming_postlog2(node);
   if (postlog2) {
      postlog2->op = gpir_op_mov;
      create_postlog2(ctx, node);
      return;
   }

   gpir_node *move = create_move(ctx, node);
   gpir_node_foreach_succ_safe(move, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr ||
          ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
         gpir_node_replace_pred(dep, node);
         if (dep->type == GPIR_DEP_INPUT)
            gpir_node_replace_child(succ, move, node);
      }
   }
   ASSERTED int score = schedule_try_node(ctx, move, false);
   assert(score != INT_MIN);
}

/* For next-max nodes, not every node can be offloaded to a move in the
 * complex slot. If we run out of non-complex slots, then such nodes cannot
 * have moves placed for them. There should always be sufficient
 * complex-capable nodes so that this isn't a problem. We also disallow moves
 * for schedule_first nodes here.
 */
static bool can_place_move(sched_ctx *ctx, gpir_node *node)
{
   if (gpir_op_infos[node->op].schedule_first)
      return false;

   if (!node->sched.next_max_node)
      return true;

   if (node->sched.complex_allowed)
      return true;

   return ctx->instr->alu_non_cplx_slot_free > 0;
}

static bool sched_move(sched_ctx *ctx)
{
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (node->sched.max_node) {
         place_move(ctx, node);
         return true;
      }
   }

   if (ctx->instr->alu_num_slot_needed_by_store > 0) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (used_by_store(node, ctx->instr)) {
            place_move(ctx, node);
            /* If we have a store of a load, then we need to make sure that
             * we immediately schedule the dependent load, or create a move
             * instruction for it, like we would with a normal instruction.
             * The rest of the code isn't set up to handle load nodes in the
             * ready list -- see the comments in _schedule_try_node().
             */
            if (node->type == gpir_node_type_load) {
               if (!schedule_try_place_node(ctx, node, false)) {
                  create_move(ctx, node);
               }
            }
            return true;
         }
      }
   }

   /* complex1 is a bit of a special case, since it has a latency of 2
    * cycles. Once it is fully ready, we need to group all its uses in the
    * same instruction, and then we need to avoid creating any moves in the
    * next cycle in order to get it scheduled. Failing to do any of these
    * things could result in a cycle penalty, or even worse, an infinite loop
    * of inserting moves. If it is a next-max node and ready, then it has a
    * use in the previous cycle. If it has a use in the current cycle as
    * well, then we want to insert a move node to make it ready in two cycles
    * -- if we don't, then there will be at least a one cycle penalty.
    * Otherwise, it will be ready next cycle, and we shouldn't insert a move
    * node, or else we'll also have a one cycle penalty.
    */
   if (ctx->instr->alu_num_slot_free > 0) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
             node->sched.ready) {
            bool skip = true;
            gpir_node_foreach_succ(node, dep) {
               if (dep->type != GPIR_DEP_INPUT)
                  continue;

               gpir_node *succ = dep->succ;

               if (!succ->sched.instr ||
                   succ->sched.instr->index != ctx->instr->index - 1) {
                  skip = false;
                  break;
               }
            }

            if (skip)
               continue;

            place_move(ctx, node);
            return true;
         }
      }
   }

   /* Once we've made all the required moves, we're free to use any extra
    * slots to schedule more moves for next max nodes. Besides sometimes
    * being necessary, this can free up extra space in the next instruction.
    * We walk from back to front so that we pick nodes less likely to be
    * scheduled next first -- an extra move would be unnecessary there. But
    * make sure not to handle the complex1 case handled above.
    */
   if (ctx->instr->alu_num_slot_free > 0) {
      list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node &&
             !(node->op == gpir_op_complex1 && node->sched.ready)) {
            place_move(ctx, node);
            return true;
         }
      }
   }

   /* We may have skipped complex1 above, but if we run out of space, we
    * still need to insert the move.
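    * This is the case where there are more next-max nodes than the
    * instruction allows, so a move has to be placed for one of them no
    * matter what.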
    */
   if (ctx->instr->alu_num_unscheduled_next_max >
       ctx->instr->alu_max_allowed_next_max) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node) {
            place_move(ctx, node);
            return true;
         }
      }
   }

   return false;
}

static bool gpir_sched_instr_pass(sched_ctx *ctx)
{
   if (try_node(ctx))
      return true;

   if (sched_move(ctx))
      return true;

   return false;
}

static void schedule_print_pre_one_instr(sched_ctx *ctx)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("instr %d for ready list:", ctx->instr->index);
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p',
             node->sched.dist, gpir_get_slots_required(node),
             node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none"));
   }
   printf("\nlive physregs: ");
   for (unsigned i = 0; i < 16; i++) {
      if (ctx->live_physregs & (0xfull << (4 * i))) {
         printf("$%d.", i);
         for (unsigned j = 0; j < 4; j++) {
            if (ctx->live_physregs & (1ull << (4 * i + j)))
               printf("%c", "xyzw"[j]);
         }
         printf(" ");
      }
   }
   printf("\n");
}

static void schedule_print_post_one_instr(gpir_instr *instr)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("post schedule instr");
   for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) {
      if (instr->slots[i])
         printf(" %d/%d", i, instr->slots[i]->index);
   }
   printf("\n");
}

static bool schedule_one_instr(sched_ctx *ctx)
{
   gpir_instr *instr = gpir_instr_create(ctx->block);
   if (unlikely(!instr))
      return false;

   ctx->instr = instr;

   sched_find_max_nodes(ctx);
   schedule_print_pre_one_instr(ctx);

   while (gpir_sched_instr_pass(ctx)) {
      assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
#ifndef NDEBUG
      verify_max_nodes(ctx);
      verify_ready_list(ctx);
#endif
   }

   schedule_print_post_one_instr(instr);
   return true;
}

static bool schedule_block(gpir_block *block)
{
   /* calculate distance */
   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      if (gpir_node_is_root(node))
         schedule_update_distance(node);
   }

   sched_ctx ctx;
   list_inithead(&ctx.ready_list);
   ctx.block = block;
   ctx.ready_list_slots = 0;
   ctx.live_physregs = block->live_out_phys;

   for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
      list_inithead(&ctx.physreg_reads[i]);
   }

   /* construct the ready list from root nodes */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      /* Add to physreg_reads */
      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         unsigned index = 4 * load->index + load->component;
         list_addtail(&load->reg_link, &ctx.physreg_reads[index]);
      }

      if (gpir_node_is_root(node))
         schedule_insert_ready_list(&ctx, node);
   }

   list_inithead(&block->node_list);
   while (!list_is_empty(&ctx.ready_list)) {
      if (!schedule_one_instr(&ctx))
         return false;
   }

   return true;
}

static void schedule_build_dependency(gpir_block *block)
{
   /* Merge dummy_f/m nodes back into the node they were created from. */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_dummy_m) {
         gpir_alu_node *alu = gpir_node_to_alu(node);
         gpir_node *origin = alu->children[0];
         gpir_node *dummy_f = alu->children[1];

         gpir_node_foreach_succ(node, dep) {
            gpir_node *succ = dep->succ;
            /* origin and node may have the same succ (by VREG/INPUT or
             * VREG/VREG dep), so use gpir_node_add_dep() instead of
             * gpir_node_replace_pred() */
            gpir_node_add_dep(succ, origin, dep->type);
            gpir_node_replace_child(succ, node, origin);
         }
         gpir_node_delete(dummy_f);
         gpir_node_delete(node);
      }
   }
}

static void print_statistic(gpir_compiler *comp, int save_index)
{
   int num_nodes[gpir_op_num] = {0};
   int num_created_nodes[gpir_op_num] = {0};

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         num_nodes[node->op]++;
         if (node->index >= save_index)
            num_created_nodes[node->op]++;
      }
   }

   printf("====== gpir scheduler statistics ======\n");
   printf("---- how many nodes are scheduled ----\n");
   int n = 0, l = 0;
   for (int i = 0; i < gpir_op_num; i++) {
      if (num_nodes[i]) {
         printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]);
         n += num_nodes[i];
         if (!(++l % 4))
            printf("\n");
      }
   }
   if (l % 4)
      printf("\n");
   printf("\ntotal: %d\n", n);

   printf("---- how many nodes are created ----\n");
   n = l = 0;
   for (int i = 0; i < gpir_op_num; i++) {
      if (num_created_nodes[i]) {
         printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]);
         n += num_created_nodes[i];
         if (!(++l % 4))
            printf("\n");
      }
   }
   if (l % 4)
      printf("\n");
   printf("\ntotal: %d\n", n);
   printf("------------------------------------\n");
}

bool gpir_schedule_prog(gpir_compiler *comp)
{
   int save_index = comp->cur_index;

   /* init schedule info */
   int index = 0;
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      block->sched.instr_index = 0;
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         node->sched.instr = NULL;
         node->sched.pos = -1;
         node->sched.index = index++;
         node->sched.dist = -1;
         /* TODO when we support multiple basic blocks, we need a way to keep
          * track of this for physregs allocated before the scheduler.
          */
         node->sched.physreg_store = NULL;
         node->sched.ready = false;
         node->sched.inserted = false;
         node->sched.complex_allowed = false;
         node->sched.max_node = false;
         node->sched.next_max_node = false;
      }
   }

   /* build dependency */
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      schedule_build_dependency(block);
   }

   //gpir_debug("after scheduler build reg dependency\n");
   //gpir_node_print_prog_dep(comp);

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      if (!schedule_block(block)) {
         gpir_error("failed to schedule block\n");
         return false;
      }
   }

   if (lima_debug & LIMA_DEBUG_GP) {
      print_statistic(comp, save_index);
      gpir_instr_print_prog(comp);
   }

   return true;
}