1/* 2 * Copyright © 2021 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/* This is a new block-level load instruction scheduler where loads are grouped 25 * according to their indirection level within a basic block. An indirection 26 * is when a result of one load is used as a source of another load. The result 27 * is that disjoint ALU opcode groups and load (texture) opcode groups are 28 * created where each next load group is the next level of indirection. 29 * It's done by finding the first and last load with the same indirection 30 * level, and moving all unrelated instructions between them after the last 31 * load except for load sources, which are moved before the first load. 32 * It naturally suits hardware that has limits on texture indirections, but 33 * other hardware can benefit too. Only texture, image, and SSBO load and 34 * atomic instructions are grouped. 35 * 36 * There is an option to group only those loads that use the same resource 37 * variable. This increases the chance to get more cache hits than if the loads 38 * were spread out. 39 * 40 * The increased register usage is offset by the increase in observed memory 41 * bandwidth due to more cache hits (dependent on hw behavior) and thus 42 * decrease the subgroup lifetime, which allows registers to be deallocated 43 * and reused sooner. In some bandwidth-bound cases, low register usage doesn't 44 * benefit at all. Doubling the register usage and using those registers to 45 * amplify observed bandwidth can improve performance a lot. 46 * 47 * It's recommended to run a hw-specific instruction scheduler after this to 48 * prevent spilling. 49 */ 50 51#include "nir.h" 52 53static bool 54is_memory_load(nir_instr *instr) 55{ 56 /* Count texture_size too because it has the same latency as cache hits. */ 57 if (instr->type == nir_instr_type_tex) 58 return true; 59 60 if (instr->type == nir_instr_type_intrinsic) { 61 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 62 const char *name = nir_intrinsic_infos[intr->intrinsic].name; 63 64 /* TODO: nir_intrinsics.py could do this */ 65 /* load_ubo is ignored because it's usually cheap. */ 66 if (!nir_intrinsic_writes_external_memory(intr) && 67 !strstr(name, "shared") && 68 (strstr(name, "ssbo") || strstr(name, "image"))) 69 return true; 70 } 71 72 return false; 73} 74 75static nir_instr * 76get_intrinsic_resource(nir_intrinsic_instr *intr) 77{ 78 /* This is also the list of intrinsics that are grouped. */ 79 /* load_ubo is ignored because it's usually cheap. */ 80 switch (intr->intrinsic) { 81 case nir_intrinsic_image_load: 82 case nir_intrinsic_image_deref_load: 83 case nir_intrinsic_image_sparse_load: 84 case nir_intrinsic_image_deref_sparse_load: 85 /* Group image_size too because it has the same latency as cache hits. */ 86 case nir_intrinsic_image_size: 87 case nir_intrinsic_image_deref_size: 88 case nir_intrinsic_bindless_image_load: 89 case nir_intrinsic_bindless_image_sparse_load: 90 case nir_intrinsic_load_ssbo: 91 return intr->src[0].ssa->parent_instr; 92 default: 93 return NULL; 94 } 95} 96 97/* Track only those that we want to group. */ 98static bool 99is_grouped_load(nir_instr *instr) 100{ 101 /* Count texture_size too because it has the same latency as cache hits. */ 102 if (instr->type == nir_instr_type_tex) 103 return true; 104 105 if (instr->type == nir_instr_type_intrinsic) 106 return get_intrinsic_resource(nir_instr_as_intrinsic(instr)) != NULL; 107 108 return false; 109} 110 111static bool 112can_move(nir_instr *instr, uint8_t current_indirection_level) 113{ 114 /* Grouping is done by moving everything else out of the first/last 115 * instruction range of the indirection level. 116 */ 117 if (is_grouped_load(instr) && instr->pass_flags == current_indirection_level) 118 return false; 119 120 if (instr->type == nir_instr_type_alu || 121 instr->type == nir_instr_type_deref || 122 instr->type == nir_instr_type_tex || 123 instr->type == nir_instr_type_load_const || 124 instr->type == nir_instr_type_ssa_undef) 125 return true; 126 127 if (instr->type == nir_instr_type_intrinsic && 128 nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr))) 129 return true; 130 131 return false; 132} 133 134static nir_instr * 135get_uniform_inst_resource(nir_instr *instr) 136{ 137 if (instr->type == nir_instr_type_tex) { 138 nir_tex_instr *tex = nir_instr_as_tex(instr); 139 140 if (tex->texture_non_uniform) 141 return NULL; 142 143 for (unsigned i = 0; i < tex->num_srcs; i++) { 144 switch (tex->src[i].src_type) { 145 case nir_tex_src_texture_deref: 146 case nir_tex_src_texture_handle: 147 return tex->src[i].src.ssa->parent_instr; 148 default: 149 break; 150 } 151 } 152 return NULL; 153 } 154 155 if (instr->type == nir_instr_type_intrinsic) 156 return get_intrinsic_resource(nir_instr_as_intrinsic(instr)); 157 158 return NULL; 159} 160 161struct check_sources_state 162{ 163 nir_block *block; 164 uint32_t first_index; 165}; 166 167static bool 168has_only_sources_less_than(nir_src *src, void *data) 169{ 170 struct check_sources_state *state = (struct check_sources_state *)data; 171 172 /* true if nir_foreach_src should keep going */ 173 return state->block != src->ssa->parent_instr->block || 174 src->ssa->parent_instr->index < state->first_index; 175} 176 177static void 178group_loads(nir_instr *first, nir_instr *last) 179{ 180 /* Walk the instruction range between the first and last backward, and 181 * move those that have no uses within the range after the last one. 182 */ 183 for (nir_instr *instr = exec_node_data_backward(nir_instr, 184 last->node.prev, node); 185 instr != first; 186 instr = exec_node_data_backward(nir_instr, instr->node.prev, node)) { 187 /* Only move instructions without side effects. */ 188 if (!can_move(instr, first->pass_flags)) 189 continue; 190 191 nir_ssa_def *def = nir_instr_ssa_def(instr); 192 if (def) { 193 bool all_uses_after_last = true; 194 195 nir_foreach_use(use, def) { 196 if (use->parent_instr->block == instr->block && 197 use->parent_instr->index <= last->index) { 198 all_uses_after_last = false; 199 break; 200 } 201 } 202 203 if (all_uses_after_last) { 204 nir_instr *move_instr = instr; 205 /* Set the last instruction because we'll delete the current one. */ 206 instr = exec_node_data_forward(nir_instr, instr->node.next, node); 207 208 /* Move the instruction after the last and update its index 209 * to indicate that it's after it. 210 */ 211 nir_instr_move(nir_after_instr(last), move_instr); 212 move_instr->index = last->index + 1; 213 } 214 } 215 } 216 217 struct check_sources_state state; 218 state.block = first->block; 219 state.first_index = first->index; 220 221 /* Walk the instruction range between the first and last forward, and move 222 * those that have no sources within the range before the first one. 223 */ 224 for (nir_instr *instr = exec_node_data_forward(nir_instr, 225 first->node.next, node); 226 instr != last; 227 instr = exec_node_data_forward(nir_instr, instr->node.next, node)) { 228 /* Only move instructions without side effects. */ 229 if (!can_move(instr, first->pass_flags)) 230 continue; 231 232 if (nir_foreach_src(instr, has_only_sources_less_than, &state)) { 233 nir_instr *move_instr = instr; 234 /* Set the last instruction because we'll delete the current one. */ 235 instr = exec_node_data_backward(nir_instr, instr->node.prev, node); 236 237 /* Move the instruction before the first and update its index 238 * to indicate that it's before it. 239 */ 240 nir_instr_move(nir_before_instr(first), move_instr); 241 move_instr->index = first->index - 1; 242 } 243 } 244} 245 246static bool 247is_pseudo_inst(nir_instr *instr) 248{ 249 /* Other instructions do not usually contribute to the shader binary size. */ 250 return instr->type != nir_instr_type_alu && 251 instr->type != nir_instr_type_call && 252 instr->type != nir_instr_type_tex && 253 instr->type != nir_instr_type_intrinsic; 254} 255 256static void 257set_instr_indices(nir_block *block) 258{ 259 /* Start with 1 because we'll move instruction before the first one 260 * and will want to label it 0. 261 */ 262 unsigned counter = 1; 263 nir_instr *last = NULL; 264 265 nir_foreach_instr(instr, block) { 266 /* Make sure grouped instructions don't have the same index as pseudo 267 * instructions. 268 */ 269 if (last && is_pseudo_inst(last) && is_grouped_load(instr)) 270 counter++; 271 272 /* Set each instruction's index within the block. */ 273 instr->index = counter; 274 275 /* Only count non-pseudo instructions. */ 276 if (!is_pseudo_inst(instr)) 277 counter++; 278 279 last = instr; 280 } 281} 282 283static void 284handle_load_range(nir_instr **first, nir_instr **last, 285 nir_instr *current, unsigned max_distance) 286{ 287 if (*first && *last && 288 (!current || current->index > (*first)->index + max_distance)) { 289 assert(*first != *last); 290 group_loads(*first, *last); 291 set_instr_indices((*first)->block); 292 *first = NULL; 293 *last = NULL; 294 } 295} 296 297static bool 298is_barrier(nir_instr *instr) 299{ 300 if (instr->type == nir_instr_type_intrinsic) { 301 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 302 const char *name = nir_intrinsic_infos[intr->intrinsic].name; 303 304 305 if (intr->intrinsic == nir_intrinsic_discard || 306 intr->intrinsic == nir_intrinsic_discard_if || 307 intr->intrinsic == nir_intrinsic_terminate || 308 intr->intrinsic == nir_intrinsic_terminate_if || 309 /* TODO: nir_intrinsics.py could do this */ 310 strstr(name, "barrier")) 311 return true; 312 } 313 314 return false; 315} 316 317struct indirection_state 318{ 319 nir_block *block; 320 unsigned indirections; 321}; 322 323static unsigned 324get_num_indirections(nir_instr *instr); 325 326static bool 327gather_indirections(nir_src *src, void *data) 328{ 329 struct indirection_state *state = (struct indirection_state *)data; 330 nir_instr *instr = src->ssa->parent_instr; 331 332 /* We only count indirections within the same block. */ 333 if (instr->block == state->block) { 334 unsigned indirections = get_num_indirections(src->ssa->parent_instr); 335 336 if (instr->type == nir_instr_type_tex || is_memory_load(instr)) 337 indirections++; 338 339 state->indirections = MAX2(state->indirections, indirections); 340 } 341 342 return true; /* whether nir_foreach_src should keep going */ 343} 344 345/* Return the number of load indirections within the block. */ 346static unsigned 347get_num_indirections(nir_instr *instr) 348{ 349 /* Don't traverse phis because we could end up in an infinite recursion 350 * if the phi points to the current block (such as a loop body). 351 */ 352 if (instr->type == nir_instr_type_phi) 353 return 0; 354 355 if (instr->index != UINT32_MAX) 356 return instr->index; /* we've visited this instruction before */ 357 358 struct indirection_state state; 359 state.block = instr->block; 360 state.indirections = 0; 361 362 nir_foreach_src(instr, gather_indirections, &state); 363 364 instr->index = state.indirections; 365 return state.indirections; 366} 367 368static void 369process_block(nir_block *block, nir_load_grouping grouping, 370 unsigned max_distance) 371{ 372 int max_indirection = -1; 373 unsigned num_inst_per_level[256] = {0}; 374 375 /* UINT32_MAX means the instruction has not been visited. Once 376 * an instruction has been visited and its indirection level has been 377 * determined, we'll store the indirection level in the index. The next 378 * instruction that visits it will use the index instead of recomputing 379 * the indirection level, which would result in an exponetial time 380 * complexity. 381 */ 382 nir_foreach_instr(instr, block) { 383 instr->index = UINT32_MAX; /* unknown */ 384 } 385 386 /* Count the number of load indirections for each load instruction 387 * within this block. Store it in pass_flags. 388 */ 389 nir_foreach_instr(instr, block) { 390 if (is_grouped_load(instr)) { 391 unsigned indirections = get_num_indirections(instr); 392 393 /* pass_flags has only 8 bits */ 394 indirections = MIN2(indirections, 255); 395 num_inst_per_level[indirections]++; 396 instr->pass_flags = indirections; 397 398 max_indirection = MAX2(max_indirection, (int)indirections); 399 } 400 } 401 402 /* 255 contains all indirection levels >= 255, so ignore them. */ 403 max_indirection = MIN2(max_indirection, 254); 404 405 /* Each indirection level is grouped. */ 406 for (int level = 0; level <= max_indirection; level++) { 407 if (num_inst_per_level[level] <= 1) 408 continue; 409 410 set_instr_indices(block); 411 412 nir_instr *resource = NULL; 413 nir_instr *first_load = NULL, *last_load = NULL; 414 415 /* Find the first and last instruction that use the same 416 * resource and are within a certain distance of each other. 417 * If found, group them by moving all movable instructions 418 * between them out. 419 */ 420 nir_foreach_instr(current, block) { 421 /* Don't group across barriers. */ 422 if (is_barrier(current)) { 423 /* Group unconditionally. */ 424 handle_load_range(&first_load, &last_load, NULL, 0); 425 first_load = NULL; 426 last_load = NULL; 427 continue; 428 } 429 430 /* Only group load instructions with the same indirection level. */ 431 if (is_grouped_load(current) && current->pass_flags == level) { 432 nir_instr *current_resource; 433 434 switch (grouping) { 435 case nir_group_all: 436 if (!first_load) 437 first_load = current; 438 else 439 last_load = current; 440 break; 441 442 case nir_group_same_resource_only: 443 current_resource = get_uniform_inst_resource(current); 444 445 if (current_resource) { 446 if (!first_load) { 447 first_load = current; 448 resource = current_resource; 449 } else if (current_resource == resource) { 450 last_load = current; 451 } 452 } 453 } 454 } 455 456 /* Group only if we exceeded the maximum distance. */ 457 handle_load_range(&first_load, &last_load, current, max_distance); 458 } 459 460 /* Group unconditionally. */ 461 handle_load_range(&first_load, &last_load, NULL, 0); 462 } 463} 464 465/* max_distance is the maximum distance between the first and last instruction 466 * in a group. 467 */ 468void 469nir_group_loads(nir_shader *shader, nir_load_grouping grouping, 470 unsigned max_distance) 471{ 472 nir_foreach_function(function, shader) { 473 if (function->impl) { 474 nir_foreach_block(block, function->impl) { 475 process_block(block, grouping, max_distance); 476 } 477 478 nir_metadata_preserve(function->impl, 479 nir_metadata_block_index | 480 nir_metadata_dominance | 481 nir_metadata_loop_analysis); 482 } 483 } 484} 485