/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
 * build the DDG for NIR (which can mostly be done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs. reads, and some more for ordering intrinsics). Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results. The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results, because it usually won't be
 * the only use, so that instruction will increase pressure.
 *
 * If you always force consumption of tex results, then in a case where a
 * single sample is used for many outputs, you'll end up picking every other
 * user and expanding register pressure. The partially_evaluated_path flag
 * helps tremendously, in that if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample. Future
 * work may include doing some local search before locking in a choice, to try
 * to more reliably find the case where just a few choices going against the
 * heuristic can manage to free the whole vector.
 */
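/* Illustrative sketch (not taken from any real shader): if one texture result
 * feeds several ALU instructions,
 *
 *    vec4 t = tex(...)
 *    a = fmul t.x, u0
 *    b = fmul t.y, u1
 *    c = fmul t.z, u2
 *    d = fmul t.w, u3
 *
 * then the moment any one of the fmuls is scheduled, the other users of t get
 * their partially_evaluated_path flag set (see
 * nir_schedule_mark_src_scheduled()), so the pressure heuristic prefers to
 * finish consuming t before it starts consuming some other vector.
 */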
static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;

typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to a struct set of
    * instructions remaining to be scheduled using the register.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_schedule_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have
    * been scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;

/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};

typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for the last instruction involved in each class of
    * dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;

static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}

static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}

static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   if (src->is_ssa) {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
   } else {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
                                          src->reg.reg);
   }
}

static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
   return def->num_components;
}

static int
nir_schedule_src_pressure(nir_src *src)
{
   if (src->is_ssa)
      return nir_schedule_def_pressure(src->ssa);
   else
      return src->reg.reg->num_components;
}

static int
nir_schedule_dest_pressure(nir_dest *dest)
{
   if (dest->is_ssa)
      return nir_schedule_def_pressure(&dest->ssa);
   else
      return dest->reg.reg->num_components;
}

/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * We add @before as a child of @after, so that the DAG heads are the outputs
 * of the program and we make our scheduling decisions bottom to top.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, 0);
   else
      dag_add_edge(&after->dag, &before->dag, 0);
}

static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}

static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}

static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
   nir_deps_state *state = in_state;

   if (src->is_ssa)
      return true;

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      src->reg.reg);
   if (!entry)
      return true;
   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            src->parent_instr);

   add_dep(state, dst_n, src_n);

   return true;
}

static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
   nir_deps_state *state = in_state;

   if (dest->is_ssa)
      return true;

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            dest->reg.parent_instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      dest->reg.reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
      return true;
   }
   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);

   return true;
}

static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       src->parent_instr);

      add_read_dep(state, def_n, use_n);
   }

   return true;
}

static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}

static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we could use to add a read_dep while
       *   handling nir_instr_type_tex.
       *
       * * One on the unknown intrinsic set, as we want discard to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images.
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_shared2_amd:
      /* Don't move a load_shared beyond a following store_shared, as that
       * could change its value.
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_shared2_amd:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
   case nir_intrinsic_group_memory_barrier:
   /* A generic memory barrier can be emitted when multiple synchronization
    * semantics are involved, including shared memory.
    */
   case nir_intrinsic_memory_barrier:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_scoped_barrier: {
      const nir_variable_mode modes = nir_intrinsic_memory_modes(instr);

      if (modes & nir_var_mem_shared)
         add_write_dep(state, &state->store_shared, n);

      /* Serialize against other categories. */
      add_write_dep(state, &state->unknown_intrinsic, n);

      break;
   }

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}
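/* Illustrative sketch of the intrinsic_cb contract consulted above (the
 * backend name and intrinsic choices here are hypothetical, not part of this
 * file): the callback returns true and fills in a dependency class plus a
 * read/write type for intrinsics the backend wants kept ordered against each
 * other.
 *
 *    static bool
 *    backend_intrinsic_cb(nir_intrinsic_instr *instr,
 *                         nir_schedule_dependency *dep,
 *                         void *user_data)
 *    {
 *       switch (instr->intrinsic) {
 *       case nir_intrinsic_store_ssbo:
 *          dep->klass = 0;
 *          dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
 *          return true;
 *       case nir_intrinsic_load_ssbo:
 *          dep->klass = 0;
 *          dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
 *          return true;
 *       default:
 *          return false;
 *       }
 *    }
 */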
/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);

   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}

static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;

static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, src->parent_instr)) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}

static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}

static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;

   if (dest->is_ssa)
      return true;
   nir_register *reg = dest->reg.reg;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_dest_pressure(dest);

   return true;
}

static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);

   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);

   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);

   return state.regs_freed;
}

/**
 * Chooses an instruction that will minimise the register pressure as much as
 * possible. This should only be used as a fallback when the regular scheduling
 * generates a shader whose register allocation fails.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible; otherwise picks the instruction on the critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs. This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value will tend to block us from freeing that value. This had a
    * massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }
      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}

static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use. Be careful when one
    * instruction uses a value twice to not decrement pressure twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}

static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear whether it would help. Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, " New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));
   return true;
}

static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}

static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   /* SSA defs were handled in nir_schedule_mark_def_scheduled() */
   if (dest->is_ssa)
      return true;

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow. We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, dest->reg.reg,
                         dest->reg.parent_instr,
                         nir_schedule_dest_pressure(dest));

   return true;
}

static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, " New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}

static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}
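/* Illustrative sketch of the instr_delay_cb hook consulted by
 * nir_schedule_get_delay() below (the backend name and delay values are made
 * up; the signature is shown as used at the call site, the exact typedef
 * lives in nir_schedule.h): a backend can override the default per-instruction
 * delay estimates to expose its own latencies to the scheduler.
 *
 *    static unsigned
 *    backend_instr_delay_cb(nir_instr *instr, void *user_data)
 *    {
 *       if (instr->type == nir_instr_type_tex)
 *          return 80;
 *
 *       return 1;
 *    }
 */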
static uint32_t
nir_schedule_get_delay(nir_schedule_scoreboard *scoreboard, nir_instr *instr)
{
   if (scoreboard->options->instr_delay_cb) {
      void *cb_data = scoreboard->options->instr_delay_cb_data;
      return scoreboard->options->instr_delay_cb(instr, cb_data);
   }

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   case nir_instr_type_intrinsic:
      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_scratch:
      case nir_intrinsic_load_shared:
      case nir_intrinsic_image_load:
         return 50;
      default:
         return 1;
      }
      break;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}

static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(scoreboard, instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}

static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, src->parent_instr);
   }

   /* XXX: Handle if uses */

   return true;
}

static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses =
      _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function(function, shader) {
      nir_foreach_register(reg, &function->impl->registers) {
         struct set *register_uses =
            _mesa_pointer_set_create(scoreboard);

         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);

         nir_foreach_use(src, reg) {
            _mesa_set_add(register_uses, src->parent_instr);
         }

         /* XXX: Handle if uses */

         nir_foreach_def(dest, reg) {
            _mesa_set_add(register_uses, dest->reg.parent_instr);
         }
      }

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
                                scoreboard);
         }

         /* XXX: We're ignoring if uses, which may prioritize scheduling other
          * uses of the if src even when it doesn't help. That's not many
          * values, though, so meh.
          */
      }
   }

   return scoreboard;
}

static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling. "
                    "Affected instructions: \n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}

/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents the "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard). This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately. The amount below the limit is up to you to
 * tune.
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}
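/* Usage sketch (illustrative only, not part of this file): a backend would
 * typically fill in nir_schedule_options and call nir_schedule() after phis
 * have been lowered out of SSA. The threshold value and stage mask below are
 * made-up examples, not recommendations.
 *
 *    static void
 *    backend_schedule(nir_shader *s)
 *    {
 *       const nir_schedule_options options = {
 *          .threshold = 64,
 *          .stages_with_shared_io_memory = (1 << MESA_SHADER_GEOMETRY),
 *       };
 *
 *       nir_schedule(s, &options);
 *    }
 */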