/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Jason Ekstrand (jason@jlekstrand.net)
 *
 */

#include "nir.h"
#include "nir_instr_set.h"

/*
 * Implements Global Code Motion. A description of GCM can be found in
 * "Global Code Motion; Global Value Numbering" by Cliff Click.
 * Unfortunately, the algorithm presented in the paper is broken in a
 * number of ways. The algorithm used here differs substantially from the
 * one in the paper but it is, in my opinion, much easier to read and to
 * verify for correctness.
 */
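
/* As a rough illustration (not an example from the paper): GCM can hoist a
 * computation whose sources are all defined outside a loop out of that loop,
 *
 *    loop {                          x = a * b;
 *       x = a * b;          =>       loop {
 *       use(x);                         use(x);
 *    }                               }
 *
 * and, conversely, sink a value that is only used on one side of an if down
 * into that side, subject to the pinning rules and the loop/if heuristics
 * implemented below.
 */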

/* This is used to stop GCM from moving instructions out of a loop if the
 * loop contains too many instructions and moving them would create excess
 * spilling.
 *
 * TODO: Figure out a better way to decide if we should remove instructions
 * from a loop.
 */
#define MAX_LOOP_INSTRUCTIONS 100

struct gcm_block_info {
   /* Number of loops this block is inside */
   unsigned loop_depth;

   /* Number of ifs this block is inside */
   unsigned if_depth;

   unsigned loop_instr_count;

   /* The loop the block is nested inside or NULL */
   nir_loop *loop;

   /* The last instruction inserted into this block. This is used as we
    * traverse the instructions and insert them back into the program to
    * put them in the right order.
    */
   nir_instr *last_instr;
};

struct gcm_instr_info {
   nir_block *early_block;
};

/* Flags used in the instr->pass_flags field for various instruction states */
enum {
   GCM_INSTR_PINNED = (1 << 0),
   GCM_INSTR_SCHEDULE_EARLIER_ONLY = (1 << 1),
   GCM_INSTR_SCHEDULED_EARLY = (1 << 2),
   GCM_INSTR_SCHEDULED_LATE = (1 << 3),
   GCM_INSTR_PLACED = (1 << 4),
};

struct gcm_state {
   nir_function_impl *impl;
   nir_instr *instr;

   bool progress;

   /* The list of instructions that have not yet been placed. As we pin
    * instructions, we pull every instruction that is not already placed out
    * of its block and add it to this list. This saves us from having
    * linked-list problems when we go to put instructions back in their
    * blocks.
    */
   struct exec_list instrs;

   struct gcm_block_info *blocks;

   unsigned num_instrs;
   struct gcm_instr_info *instr_infos;
};

static unsigned
get_loop_instr_count(struct exec_list *cf_list)
{
   unsigned loop_instr_count = 0;
   foreach_list_typed(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         nir_foreach_instr(instr, block) {
            loop_instr_count++;
         }
         break;
      }
      case nir_cf_node_if: {
         nir_if *if_stmt = nir_cf_node_as_if(node);
         loop_instr_count += get_loop_instr_count(&if_stmt->then_list);
         loop_instr_count += get_loop_instr_count(&if_stmt->else_list);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);
         loop_instr_count += get_loop_instr_count(&loop->body);
         break;
      }
      default:
         unreachable("Invalid CF node type");
      }
   }

   return loop_instr_count;
}

/* Recursively walks the CFG and builds the block_info structure */
static void
gcm_build_block_info(struct exec_list *cf_list, struct gcm_state *state,
                     nir_loop *loop, unsigned loop_depth, unsigned if_depth,
                     unsigned loop_instr_count)
{
   foreach_list_typed(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         state->blocks[block->index].if_depth = if_depth;
         state->blocks[block->index].loop_depth = loop_depth;
         state->blocks[block->index].loop_instr_count = loop_instr_count;
         state->blocks[block->index].loop = loop;
         break;
      }
      case nir_cf_node_if: {
         nir_if *if_stmt = nir_cf_node_as_if(node);
         gcm_build_block_info(&if_stmt->then_list, state, loop, loop_depth,
                              if_depth + 1, ~0u);
         gcm_build_block_info(&if_stmt->else_list, state, loop, loop_depth,
                              if_depth + 1, ~0u);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);
         gcm_build_block_info(&loop->body, state, loop, loop_depth + 1, if_depth,
                              get_loop_instr_count(&loop->body));
         break;
      }
      default:
         unreachable("Invalid CF node type");
      }
   }
}

static bool
is_src_scalarizable(nir_src *src)
{
   assert(src->is_ssa);

   nir_instr *src_instr = src->ssa->parent_instr;
   switch (src_instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *src_alu = nir_instr_as_alu(src_instr);

      /* ALU operations with output_size == 0 should be scalarized. We
       * will also see a bunch of vecN operations from scalarizing ALU
       * operations and, since they can easily be copy-propagated, they
       * are ok too.
       */
      return nir_op_infos[src_alu->op].output_size == 0 ||
             src_alu->op == nir_op_vec2 ||
             src_alu->op == nir_op_vec3 ||
             src_alu->op == nir_op_vec4;
   }

   case nir_instr_type_load_const:
      /* These are trivially scalarizable */
      return true;

   case nir_instr_type_ssa_undef:
      return true;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *src_intrin = nir_instr_as_intrinsic(src_instr);

      switch (src_intrin->intrinsic) {
      case nir_intrinsic_load_deref: {
         /* Don't scalarize if we see a load of a local variable because it
          * might turn into one of the things we can't scalarize.
          */
         nir_deref_instr *deref = nir_src_as_deref(src_intrin->src[0]);
         return !nir_deref_mode_may_be(deref, (nir_var_function_temp |
                                               nir_var_shader_temp));
      }

      case nir_intrinsic_interp_deref_at_centroid:
      case nir_intrinsic_interp_deref_at_sample:
      case nir_intrinsic_interp_deref_at_offset:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_input:
         return true;
      default:
         break;
      }

      return false;
   }

   default:
      /* We can't scalarize this type of instruction */
      return false;
   }
}

static bool
is_binding_uniform(nir_src src)
{
   nir_binding binding = nir_chase_binding(src);
   if (!binding.success)
      return false;

   for (unsigned i = 0; i < binding.num_indices; i++) {
      if (!nir_src_is_always_uniform(binding.indices[i]))
         return false;
   }

   return true;
}

static void
pin_intrinsic(nir_intrinsic_instr *intrin)
{
   nir_instr *instr = &intrin->instr;

   if (!nir_intrinsic_can_reorder(intrin)) {
      instr->pass_flags = GCM_INSTR_PINNED;
      return;
   }

   instr->pass_flags = 0;

   /* If the intrinsic requires a uniform source, we can't safely move it
    * across non-uniform control flow if it's not uniform at the point it's
    * defined. Stores and atomics can never be re-ordered, so we don't have
    * to consider them here.
    */
   bool non_uniform = nir_intrinsic_has_access(intrin) &&
                      (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM);
   if (!non_uniform &&
       (intrin->intrinsic == nir_intrinsic_load_ubo ||
        intrin->intrinsic == nir_intrinsic_load_ssbo ||
        intrin->intrinsic == nir_intrinsic_get_ubo_size ||
        intrin->intrinsic == nir_intrinsic_get_ssbo_size ||
        nir_intrinsic_has_image_dim(intrin) ||
        ((intrin->intrinsic == nir_intrinsic_load_deref ||
          intrin->intrinsic == nir_intrinsic_deref_buffer_array_length) &&
         nir_deref_mode_may_be(nir_src_as_deref(intrin->src[0]),
                               nir_var_mem_ubo | nir_var_mem_ssbo)))) {
      if (!is_binding_uniform(intrin->src[0]))
         instr->pass_flags = GCM_INSTR_PINNED;
   } else if (intrin->intrinsic == nir_intrinsic_load_push_constant) {
      if (!nir_src_is_always_uniform(intrin->src[0]))
         instr->pass_flags = GCM_INSTR_PINNED;
   } else if (intrin->intrinsic == nir_intrinsic_load_deref &&
              nir_deref_mode_is(nir_src_as_deref(intrin->src[0]),
                                nir_var_mem_push_const)) {
      nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
      while (deref->deref_type != nir_deref_type_var) {
         if ((deref->deref_type == nir_deref_type_array ||
              deref->deref_type == nir_deref_type_ptr_as_array) &&
             !nir_src_is_always_uniform(deref->arr.index)) {
            instr->pass_flags = GCM_INSTR_PINNED;
            return;
         }
         deref = nir_deref_instr_parent(deref);
         if (!deref) {
            instr->pass_flags = GCM_INSTR_PINNED;
            return;
         }
      }
   }
}

/* Walks the instruction list and marks immovable instructions as pinned or
 * placed.
 *
 * This function also serves to initialize the instr->pass_flags field.
 * After this is completed, each instruction's pass_flags field will be set
 * to GCM_INSTR_PINNED, GCM_INSTR_PLACED, GCM_INSTR_SCHEDULE_EARLIER_ONLY
 * or 0.
 */
static void
gcm_pin_instructions(nir_function_impl *impl, struct gcm_state *state)
{
   state->num_instrs = 0;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         /* Index the instructions for use in gcm_state::instrs */
         instr->index = state->num_instrs++;

         switch (instr->type) {
         case nir_instr_type_alu:
            switch (nir_instr_as_alu(instr)->op) {
            case nir_op_fddx:
            case nir_op_fddy:
            case nir_op_fddx_fine:
            case nir_op_fddy_fine:
            case nir_op_fddx_coarse:
            case nir_op_fddy_coarse:
               /* These can only go in uniform control flow */
               instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY;
               break;

            case nir_op_mov:
               if (!is_src_scalarizable(&(nir_instr_as_alu(instr)->src[0].src))) {
                  instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               }
               FALLTHROUGH;

            default:
               instr->pass_flags = 0;
               break;
            }
            break;

         case nir_instr_type_tex: {
            nir_tex_instr *tex = nir_instr_as_tex(instr);
            if (nir_tex_instr_has_implicit_derivative(tex))
               instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY;

            for (unsigned i = 0; i < tex->num_srcs; i++) {
               nir_tex_src *src = &tex->src[i];
               switch (src->src_type) {
               case nir_tex_src_texture_deref:
                  if (!tex->texture_non_uniform && !is_binding_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_sampler_deref:
                  if (!tex->sampler_non_uniform && !is_binding_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_texture_offset:
               case nir_tex_src_texture_handle:
                  if (!tex->texture_non_uniform && !nir_src_is_always_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_sampler_offset:
               case nir_tex_src_sampler_handle:
                  if (!tex->sampler_non_uniform && !nir_src_is_always_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               default:
                  break;
               }
            }
            break;
         }

         case nir_instr_type_deref:
         case nir_instr_type_load_const:
            instr->pass_flags = 0;
            break;

         case nir_instr_type_intrinsic:
            pin_intrinsic(nir_instr_as_intrinsic(instr));
            break;

         case nir_instr_type_call:
            instr->pass_flags = GCM_INSTR_PINNED;
            break;

         case nir_instr_type_jump:
         case nir_instr_type_ssa_undef:
         case nir_instr_type_phi:
            instr->pass_flags = GCM_INSTR_PLACED;
            break;

         default:
            unreachable("Invalid instruction type in GCM");
         }

         if (!(instr->pass_flags & GCM_INSTR_PLACED)) {
            /* If this is an unplaced instruction, go ahead and pull it out of
             * the program and put it on the instrs list. This has a couple
             * of benefits. First, it makes the scheduling algorithm more
             * efficient because we can avoid walking over basic blocks.
             * Second, it keeps us from causing linked list confusion when
             * we're trying to put everything in its proper place at the end
             * of the pass.
             *
             * Note that we don't use nir_instr_remove here because that also
             * cleans up uses and defs and we want to keep that information.
             */
            exec_node_remove(&instr->node);
            exec_list_push_tail(&state->instrs, &instr->node);
         }
      }
   }
}
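
/* A quick map of how pass_flags evolves from here: gcm_pin_instructions()
 * leaves every instruction pinned, placed, schedule-earlier-only or
 * unmarked, and pulls every non-placed instruction onto state->instrs in
 * source order. The early-scheduling walk then ORs in
 * GCM_INSTR_SCHEDULED_EARLY and records an early_block for each
 * instruction, the late-scheduling walk ORs in GCM_INSTR_SCHEDULED_LATE
 * and picks a final block for each movable instruction, and
 * gcm_place_instr() marks everything GCM_INSTR_PLACED as it inserts the
 * extracted instructions back into their chosen blocks.
 */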

static void
gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state);

/** Update an instruction's schedule for the given source
 *
 * This function is called iteratively as we walk the sources of an
 * instruction. It ensures that the given source instruction has been
 * scheduled and then updates this instruction's block if the source
 * instruction is lower down the tree.
 */
static bool
gcm_schedule_early_src(nir_src *src, void *void_state)
{
   struct gcm_state *state = void_state;
   nir_instr *instr = state->instr;

   assert(src->is_ssa);

   gcm_schedule_early_instr(src->ssa->parent_instr, void_state);

   /* While the index isn't a proper dominance depth, it does have the
    * property that if A dominates B then A->index <= B->index. Since we
    * know that this instruction must have been dominated by all of its
    * sources at some point (even if it's gone through value-numbering),
    * all of the sources must lie on the same branch of the dominance tree.
    * Therefore, we can just go ahead and compare indices.
    */
   struct gcm_instr_info *src_info =
      &state->instr_infos[src->ssa->parent_instr->index];
   struct gcm_instr_info *info = &state->instr_infos[instr->index];
   if (info->early_block->index < src_info->early_block->index)
      info->early_block = src_info->early_block;

   /* We need to restore the state instruction because it may have been
    * changed through the gcm_schedule_early_instr call above. We may still
    * be iterating through sources, and future calls to
    * gcm_schedule_early_src for the same instruction will still need it.
    */
   state->instr = instr;

   return true;
}

/** Schedules an instruction early
 *
 * This function performs a recursive depth-first search starting at the
 * given instruction and proceeding through the sources to schedule
 * instructions as early as they can possibly go in the dominance tree.
 * The instructions are "scheduled" by updating the early_block field of
 * the corresponding gcm_instr_info entry.
 */
static void
gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_SCHEDULED_EARLY)
      return;

   instr->pass_flags |= GCM_INSTR_SCHEDULED_EARLY;

   /* Pinned/placed instructions always get scheduled in their original block
    * so we don't need to do anything. Also, bailing here keeps us from ever
    * following the sources of phi nodes which can be back-edges.
    */
   if (instr->pass_flags & GCM_INSTR_PINNED ||
       instr->pass_flags & GCM_INSTR_PLACED) {
      state->instr_infos[instr->index].early_block = instr->block;
      return;
   }

   /* Start with the instruction at the top. As we iterate over the
    * sources, it will get moved down as needed.
    */
   state->instr_infos[instr->index].early_block = nir_start_block(state->impl);
   state->instr = instr;

   nir_foreach_src(instr, gcm_schedule_early_src, state);
}
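
/* For example (illustrative): if an instruction has two sources whose early
 * blocks are A and B, with A dominating B, then the instruction's early
 * block ends up being B, the deeper of the two, since that is the first
 * block in the dominance tree where both sources are available.
 */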

static bool
set_block_for_loop_instr(struct gcm_state *state, nir_instr *instr,
                         nir_block *block)
{
   /* If the instruction wasn't in a loop to begin with we don't want to push
    * it down into one.
    */
   nir_loop *loop = state->blocks[instr->block->index].loop;
   if (loop == NULL)
      return true;

   if (nir_block_dominates(instr->block, block))
      return true;

   /* If the loop only executes a single time, i.e. it's wrapped in a:
    *    do { ... break; } while (true)
    * don't move the instruction as it will not help anything.
    */
   if (loop->info->limiting_terminator == NULL && !loop->info->complex_loop &&
       nir_block_ends_in_break(nir_loop_last_block(loop)))
      return false;

   /* Being too aggressive with how we pull instructions out of loops can
    * result in extra register pressure and spilling. For example, it's
    * fairly common for loops in compute shaders to calculate SSBO offsets
    * using the workgroup id, subgroup id and subgroup invocation; pulling
    * all of these calculations outside the loop increases register pressure.
    *
    * To work around these issues for now we only allow constant and texture
    * instructions to be moved outside their original loops, or instructions
    * where the total loop instruction count is less than
    * MAX_LOOP_INSTRUCTIONS.
    *
    * TODO: figure out some more heuristics to allow more to be moved out of
    * loops.
    */
   if (state->blocks[instr->block->index].loop_instr_count < MAX_LOOP_INSTRUCTIONS)
      return true;

   if (instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_tex)
      return true;

   return false;
}

static bool
set_block_to_if_block(struct gcm_state *state, nir_instr *instr,
                      nir_block *block)
{
   if (instr->type == nir_instr_type_load_const)
      return true;

   /* TODO: Figure out some more heuristics to allow more to be moved into
    * if-statements.
    */

   return false;
}

static nir_block *
gcm_choose_block_for_instr(nir_instr *instr, nir_block *early_block,
                           nir_block *late_block, struct gcm_state *state)
{
   assert(nir_block_dominates(early_block, late_block));

   bool block_set = false;

   /* First see if we can push the instruction down into an if-statement's
    * block.
    */
   nir_block *best = late_block;
   for (nir_block *block = late_block; block != NULL; block = block->imm_dom) {
      if (state->blocks[block->index].loop_depth >
          state->blocks[instr->block->index].loop_depth)
         continue;

      if (state->blocks[block->index].if_depth >=
          state->blocks[best->index].if_depth &&
          set_block_to_if_block(state, instr, block)) {
         /* If we are pushing the instruction into an if we want it to be
          * in the earliest block not the latest to avoid creating register
          * pressure issues. So we don't break unless we come across the
          * block the instruction was originally in.
          */
         best = block;
         block_set = true;
         if (block == instr->block)
            break;
      } else if (block == instr->block) {
         /* If we couldn't push the instruction later, just put it back where
          * it was previously.
          */
         if (!block_set)
            best = block;
         break;
      }

      if (block == early_block)
         break;
   }

   /* Now see if we can evict the instruction from a loop */
   for (nir_block *block = late_block; block != NULL; block = block->imm_dom) {
      if (state->blocks[block->index].loop_depth <
          state->blocks[best->index].loop_depth) {
         if (set_block_for_loop_instr(state, instr, block)) {
            best = block;
         } else if (block == instr->block) {
            if (!block_set)
               best = block;
            break;
         }
      }

      if (block == early_block)
         break;
   }

   return best;
}

static void
gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state);

/** Schedules the instruction associated with the given SSA def late
 *
 * This function works by first walking all of the uses of the given SSA
 * definition, ensuring that they are scheduled, and then computing the LCA
 * (least common ancestor) of its uses. It then schedules this instruction
 * as close to the LCA as possible while trying to stay out of loops.
 */
static bool
gcm_schedule_late_def(nir_ssa_def *def, void *void_state)
{
   struct gcm_state *state = void_state;

   nir_block *lca = NULL;

   nir_foreach_use(use_src, def) {
      nir_instr *use_instr = use_src->parent_instr;

      gcm_schedule_late_instr(use_instr, state);

      /* Phi instructions are a bit special. SSA definitions don't have to
       * dominate the sources of the phi nodes that use them; instead, they
       * have to dominate the predecessor block corresponding to the phi
       * source. We handle this by looking through the sources, finding
       * any that are using this SSA def, and using those blocks instead
       * of the one the phi lives in.
       */
      if (use_instr->type == nir_instr_type_phi) {
         nir_phi_instr *phi = nir_instr_as_phi(use_instr);

         nir_foreach_phi_src(phi_src, phi) {
            if (phi_src->src.ssa == def)
               lca = nir_dominance_lca(lca, phi_src->pred);
         }
      } else {
         lca = nir_dominance_lca(lca, use_instr->block);
      }
   }

   nir_foreach_if_use(use_src, def) {
      nir_if *if_stmt = use_src->parent_if;

      /* For if statements, we consider the block to be the one immediately
       * preceding the if CF node.
       */
      nir_block *pred_block =
         nir_cf_node_as_block(nir_cf_node_prev(&if_stmt->cf_node));

      lca = nir_dominance_lca(lca, pred_block);
   }

   nir_block *early_block =
      state->instr_infos[def->parent_instr->index].early_block;

   /* Some instructions may never be used. Flag them and the instruction
    * placement code will get rid of them for us.
    */
   if (lca == NULL) {
      def->parent_instr->block = NULL;
      return true;
   }

   if (def->parent_instr->pass_flags & GCM_INSTR_SCHEDULE_EARLIER_ONLY &&
       lca != def->parent_instr->block &&
       nir_block_dominates(def->parent_instr->block, lca)) {
      lca = def->parent_instr->block;
   }

   /* We now have the LCA of all of the uses. If our invariants hold,
    * this is dominated by the block that we chose when scheduling early.
    * We now walk up the dominance tree and pick the lowest block that is
    * as far outside loops as we can get.
    */
   nir_block *best_block =
      gcm_choose_block_for_instr(def->parent_instr, early_block, lca, state);

   if (def->parent_instr->block != best_block)
      state->progress = true;

   def->parent_instr->block = best_block;

   return true;
}
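
/* An illustrative example of the phi special case above (not from a real
 * shader): if block_1 and block_2 both jump to block_3 and we have
 *
 *    block_1:  ssa_1 = ...
 *    block_3:  ssa_4 = phi(block_1: ssa_1, block_2: ssa_2)
 *
 * then the phi use of ssa_1 contributes block_1 (its corresponding
 * predecessor) to the LCA rather than block_3, so ssa_1 only needs to be
 * scheduled somewhere that dominates block_1.
 */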

/** Schedules an instruction late
 *
 * This function performs a depth-first search starting at the given
 * instruction and proceeding through its uses to schedule instructions as
 * late as they can reasonably go in the dominance tree. The instructions
 * are "scheduled" by updating their instr->block field.
 *
 * The name of this function is actually a bit of a misnomer as it doesn't
 * schedule them "as late as possible" as the paper implies. Instead, it
 * first finds the latest possible place it can schedule the instruction and
 * then possibly schedules it earlier than that. The actual location is as
 * far down the tree as we can go while trying to stay out of loops.
 */
static void
gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_SCHEDULED_LATE)
      return;

   instr->pass_flags |= GCM_INSTR_SCHEDULED_LATE;

   /* Pinned/placed instructions are already scheduled so we don't need to do
    * anything. Also, bailing here keeps us from ever following phi nodes
    * which can be back-edges.
    */
   if (instr->pass_flags & GCM_INSTR_PLACED ||
       instr->pass_flags & GCM_INSTR_PINNED)
      return;

   nir_foreach_ssa_def(instr, gcm_schedule_late_def, state);
}

static bool
gcm_replace_def_with_undef(nir_ssa_def *def, void *void_state)
{
   struct gcm_state *state = void_state;

   if (nir_ssa_def_is_unused(def))
      return true;

   nir_ssa_undef_instr *undef =
      nir_ssa_undef_instr_create(state->impl->function->shader,
                                 def->num_components, def->bit_size);
   nir_instr_insert(nir_before_cf_list(&state->impl->body), &undef->instr);
   nir_ssa_def_rewrite_uses(def, &undef->def);

   return true;
}

/** Places an instruction back into the program
 *
 * The earlier phases of GCM simply choose blocks for each instruction and
 * otherwise leave them alone. This pass actually places the instructions
 * into their chosen blocks.
 *
 * To do so, we simply insert instructions in the reverse order they were
 * extracted. This will place instructions that were scheduled earlier
 * onto the end of their new block and instructions that were scheduled
 * later at the start of their new block.
 */
static void
gcm_place_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_PLACED)
      return;

   instr->pass_flags |= GCM_INSTR_PLACED;

   if (instr->block == NULL) {
      nir_foreach_ssa_def(instr, gcm_replace_def_with_undef, state);
      nir_instr_remove(instr);
      return;
   }

   struct gcm_block_info *block_info = &state->blocks[instr->block->index];
   exec_node_remove(&instr->node);

   if (block_info->last_instr) {
      exec_node_insert_node_before(&block_info->last_instr->node,
                                   &instr->node);
   } else {
      /* Schedule it at the end of the block */
      nir_instr *jump_instr = nir_block_last_instr(instr->block);
      if (jump_instr && jump_instr->type == nir_instr_type_jump) {
         exec_node_insert_node_before(&jump_instr->node, &instr->node);
      } else {
         exec_list_push_tail(&instr->block->instr_list, &instr->node);
      }
   }

   block_info->last_instr = instr;
}

static bool
opt_gcm_impl(nir_shader *shader, nir_function_impl *impl, bool value_number)
{
   nir_metadata_require(impl, nir_metadata_block_index |
                              nir_metadata_dominance);
   nir_metadata_require(impl, nir_metadata_loop_analysis,
                        shader->options->force_indirect_unrolling,
                        shader->options->force_indirect_unrolling_sampler);

   /* A previous pass may have left pass_flags dirty, so clear it all out. */
   nir_foreach_block(block, impl)
      nir_foreach_instr(instr, block)
         instr->pass_flags = 0;

   struct gcm_state state;

   state.impl = impl;
   state.instr = NULL;
   state.progress = false;
   exec_list_make_empty(&state.instrs);
   state.blocks = rzalloc_array(NULL, struct gcm_block_info, impl->num_blocks);

   gcm_build_block_info(&impl->body, &state, NULL, 0, 0, ~0u);

   gcm_pin_instructions(impl, &state);

   state.instr_infos =
      rzalloc_array(NULL, struct gcm_instr_info, state.num_instrs);

   if (value_number) {
      struct set *gvn_set = nir_instr_set_create(NULL);
      foreach_list_typed_safe(nir_instr, instr, node, &state.instrs) {
         if (instr->pass_flags & GCM_INSTR_PINNED)
            continue;

         if (nir_instr_set_add_or_rewrite(gvn_set, instr, NULL))
            state.progress = true;
      }
      nir_instr_set_destroy(gvn_set);
   }

   foreach_list_typed(nir_instr, instr, node, &state.instrs)
      gcm_schedule_early_instr(instr, &state);

   foreach_list_typed(nir_instr, instr, node, &state.instrs)
      gcm_schedule_late_instr(instr, &state);

   while (!exec_list_is_empty(&state.instrs)) {
      nir_instr *instr = exec_node_data(nir_instr,
                                        state.instrs.tail_sentinel.prev, node);
      gcm_place_instr(instr, &state);
   }

   ralloc_free(state.blocks);
   ralloc_free(state.instr_infos);

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance |
                               nir_metadata_loop_analysis);

   return state.progress;
}

bool
nir_opt_gcm(nir_shader *shader, bool value_number)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= opt_gcm_impl(shader, function->impl, value_number);
   }

   return progress;
}
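
/* A minimal sketch of how a driver might run this pass as part of its
 * optimization loop (illustrative; the surrounding loop and the "progress"
 * bookkeeping are assumptions, not part of this file):
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       NIR_PASS(progress, shader, nir_opt_gcm, true);
 *       // ... other passes ...
 *    } while (progress);
 *
 * Passing value_number=true additionally value-numbers the unpinned
 * instructions before scheduling, as implemented in opt_gcm_impl() above.
 */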