1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2021 Advanced Micro Devices, Inc. 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci */ 23bf215546Sopenharmony_ci 24bf215546Sopenharmony_ci/* This is a new block-level load instruction scheduler where loads are grouped 25bf215546Sopenharmony_ci * according to their indirection level within a basic block. An indirection 26bf215546Sopenharmony_ci * is when a result of one load is used as a source of another load. The result 27bf215546Sopenharmony_ci * is that disjoint ALU opcode groups and load (texture) opcode groups are 28bf215546Sopenharmony_ci * created where each next load group is the next level of indirection. 29bf215546Sopenharmony_ci * It's done by finding the first and last load with the same indirection 30bf215546Sopenharmony_ci * level, and moving all unrelated instructions between them after the last 31bf215546Sopenharmony_ci * load except for load sources, which are moved before the first load. 32bf215546Sopenharmony_ci * It naturally suits hardware that has limits on texture indirections, but 33bf215546Sopenharmony_ci * other hardware can benefit too. Only texture, image, and SSBO load and 34bf215546Sopenharmony_ci * atomic instructions are grouped. 35bf215546Sopenharmony_ci * 36bf215546Sopenharmony_ci * There is an option to group only those loads that use the same resource 37bf215546Sopenharmony_ci * variable. This increases the chance to get more cache hits than if the loads 38bf215546Sopenharmony_ci * were spread out. 39bf215546Sopenharmony_ci * 40bf215546Sopenharmony_ci * The increased register usage is offset by the increase in observed memory 41bf215546Sopenharmony_ci * bandwidth due to more cache hits (dependent on hw behavior) and thus 42bf215546Sopenharmony_ci * decrease the subgroup lifetime, which allows registers to be deallocated 43bf215546Sopenharmony_ci * and reused sooner. In some bandwidth-bound cases, low register usage doesn't 44bf215546Sopenharmony_ci * benefit at all. Doubling the register usage and using those registers to 45bf215546Sopenharmony_ci * amplify observed bandwidth can improve performance a lot. 46bf215546Sopenharmony_ci * 47bf215546Sopenharmony_ci * It's recommended to run a hw-specific instruction scheduler after this to 48bf215546Sopenharmony_ci * prevent spilling. 49bf215546Sopenharmony_ci */ 50bf215546Sopenharmony_ci 51bf215546Sopenharmony_ci#include "nir.h" 52bf215546Sopenharmony_ci 53bf215546Sopenharmony_cistatic bool 54bf215546Sopenharmony_ciis_memory_load(nir_instr *instr) 55bf215546Sopenharmony_ci{ 56bf215546Sopenharmony_ci /* Count texture_size too because it has the same latency as cache hits. */ 57bf215546Sopenharmony_ci if (instr->type == nir_instr_type_tex) 58bf215546Sopenharmony_ci return true; 59bf215546Sopenharmony_ci 60bf215546Sopenharmony_ci if (instr->type == nir_instr_type_intrinsic) { 61bf215546Sopenharmony_ci nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 62bf215546Sopenharmony_ci const char *name = nir_intrinsic_infos[intr->intrinsic].name; 63bf215546Sopenharmony_ci 64bf215546Sopenharmony_ci /* TODO: nir_intrinsics.py could do this */ 65bf215546Sopenharmony_ci /* load_ubo is ignored because it's usually cheap. */ 66bf215546Sopenharmony_ci if (!nir_intrinsic_writes_external_memory(intr) && 67bf215546Sopenharmony_ci !strstr(name, "shared") && 68bf215546Sopenharmony_ci (strstr(name, "ssbo") || strstr(name, "image"))) 69bf215546Sopenharmony_ci return true; 70bf215546Sopenharmony_ci } 71bf215546Sopenharmony_ci 72bf215546Sopenharmony_ci return false; 73bf215546Sopenharmony_ci} 74bf215546Sopenharmony_ci 75bf215546Sopenharmony_cistatic nir_instr * 76bf215546Sopenharmony_ciget_intrinsic_resource(nir_intrinsic_instr *intr) 77bf215546Sopenharmony_ci{ 78bf215546Sopenharmony_ci /* This is also the list of intrinsics that are grouped. */ 79bf215546Sopenharmony_ci /* load_ubo is ignored because it's usually cheap. */ 80bf215546Sopenharmony_ci switch (intr->intrinsic) { 81bf215546Sopenharmony_ci case nir_intrinsic_image_load: 82bf215546Sopenharmony_ci case nir_intrinsic_image_deref_load: 83bf215546Sopenharmony_ci case nir_intrinsic_image_sparse_load: 84bf215546Sopenharmony_ci case nir_intrinsic_image_deref_sparse_load: 85bf215546Sopenharmony_ci /* Group image_size too because it has the same latency as cache hits. */ 86bf215546Sopenharmony_ci case nir_intrinsic_image_size: 87bf215546Sopenharmony_ci case nir_intrinsic_image_deref_size: 88bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_load: 89bf215546Sopenharmony_ci case nir_intrinsic_bindless_image_sparse_load: 90bf215546Sopenharmony_ci case nir_intrinsic_load_ssbo: 91bf215546Sopenharmony_ci return intr->src[0].ssa->parent_instr; 92bf215546Sopenharmony_ci default: 93bf215546Sopenharmony_ci return NULL; 94bf215546Sopenharmony_ci } 95bf215546Sopenharmony_ci} 96bf215546Sopenharmony_ci 97bf215546Sopenharmony_ci/* Track only those that we want to group. */ 98bf215546Sopenharmony_cistatic bool 99bf215546Sopenharmony_ciis_grouped_load(nir_instr *instr) 100bf215546Sopenharmony_ci{ 101bf215546Sopenharmony_ci /* Count texture_size too because it has the same latency as cache hits. */ 102bf215546Sopenharmony_ci if (instr->type == nir_instr_type_tex) 103bf215546Sopenharmony_ci return true; 104bf215546Sopenharmony_ci 105bf215546Sopenharmony_ci if (instr->type == nir_instr_type_intrinsic) 106bf215546Sopenharmony_ci return get_intrinsic_resource(nir_instr_as_intrinsic(instr)) != NULL; 107bf215546Sopenharmony_ci 108bf215546Sopenharmony_ci return false; 109bf215546Sopenharmony_ci} 110bf215546Sopenharmony_ci 111bf215546Sopenharmony_cistatic bool 112bf215546Sopenharmony_cican_move(nir_instr *instr, uint8_t current_indirection_level) 113bf215546Sopenharmony_ci{ 114bf215546Sopenharmony_ci /* Grouping is done by moving everything else out of the first/last 115bf215546Sopenharmony_ci * instruction range of the indirection level. 116bf215546Sopenharmony_ci */ 117bf215546Sopenharmony_ci if (is_grouped_load(instr) && instr->pass_flags == current_indirection_level) 118bf215546Sopenharmony_ci return false; 119bf215546Sopenharmony_ci 120bf215546Sopenharmony_ci if (instr->type == nir_instr_type_alu || 121bf215546Sopenharmony_ci instr->type == nir_instr_type_deref || 122bf215546Sopenharmony_ci instr->type == nir_instr_type_tex || 123bf215546Sopenharmony_ci instr->type == nir_instr_type_load_const || 124bf215546Sopenharmony_ci instr->type == nir_instr_type_ssa_undef) 125bf215546Sopenharmony_ci return true; 126bf215546Sopenharmony_ci 127bf215546Sopenharmony_ci if (instr->type == nir_instr_type_intrinsic && 128bf215546Sopenharmony_ci nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr))) 129bf215546Sopenharmony_ci return true; 130bf215546Sopenharmony_ci 131bf215546Sopenharmony_ci return false; 132bf215546Sopenharmony_ci} 133bf215546Sopenharmony_ci 134bf215546Sopenharmony_cistatic nir_instr * 135bf215546Sopenharmony_ciget_uniform_inst_resource(nir_instr *instr) 136bf215546Sopenharmony_ci{ 137bf215546Sopenharmony_ci if (instr->type == nir_instr_type_tex) { 138bf215546Sopenharmony_ci nir_tex_instr *tex = nir_instr_as_tex(instr); 139bf215546Sopenharmony_ci 140bf215546Sopenharmony_ci if (tex->texture_non_uniform) 141bf215546Sopenharmony_ci return NULL; 142bf215546Sopenharmony_ci 143bf215546Sopenharmony_ci for (unsigned i = 0; i < tex->num_srcs; i++) { 144bf215546Sopenharmony_ci switch (tex->src[i].src_type) { 145bf215546Sopenharmony_ci case nir_tex_src_texture_deref: 146bf215546Sopenharmony_ci case nir_tex_src_texture_handle: 147bf215546Sopenharmony_ci return tex->src[i].src.ssa->parent_instr; 148bf215546Sopenharmony_ci default: 149bf215546Sopenharmony_ci break; 150bf215546Sopenharmony_ci } 151bf215546Sopenharmony_ci } 152bf215546Sopenharmony_ci return NULL; 153bf215546Sopenharmony_ci } 154bf215546Sopenharmony_ci 155bf215546Sopenharmony_ci if (instr->type == nir_instr_type_intrinsic) 156bf215546Sopenharmony_ci return get_intrinsic_resource(nir_instr_as_intrinsic(instr)); 157bf215546Sopenharmony_ci 158bf215546Sopenharmony_ci return NULL; 159bf215546Sopenharmony_ci} 160bf215546Sopenharmony_ci 161bf215546Sopenharmony_cistruct check_sources_state 162bf215546Sopenharmony_ci{ 163bf215546Sopenharmony_ci nir_block *block; 164bf215546Sopenharmony_ci uint32_t first_index; 165bf215546Sopenharmony_ci}; 166bf215546Sopenharmony_ci 167bf215546Sopenharmony_cistatic bool 168bf215546Sopenharmony_cihas_only_sources_less_than(nir_src *src, void *data) 169bf215546Sopenharmony_ci{ 170bf215546Sopenharmony_ci struct check_sources_state *state = (struct check_sources_state *)data; 171bf215546Sopenharmony_ci 172bf215546Sopenharmony_ci /* true if nir_foreach_src should keep going */ 173bf215546Sopenharmony_ci return state->block != src->ssa->parent_instr->block || 174bf215546Sopenharmony_ci src->ssa->parent_instr->index < state->first_index; 175bf215546Sopenharmony_ci} 176bf215546Sopenharmony_ci 177bf215546Sopenharmony_cistatic void 178bf215546Sopenharmony_cigroup_loads(nir_instr *first, nir_instr *last) 179bf215546Sopenharmony_ci{ 180bf215546Sopenharmony_ci /* Walk the instruction range between the first and last backward, and 181bf215546Sopenharmony_ci * move those that have no uses within the range after the last one. 182bf215546Sopenharmony_ci */ 183bf215546Sopenharmony_ci for (nir_instr *instr = exec_node_data_backward(nir_instr, 184bf215546Sopenharmony_ci last->node.prev, node); 185bf215546Sopenharmony_ci instr != first; 186bf215546Sopenharmony_ci instr = exec_node_data_backward(nir_instr, instr->node.prev, node)) { 187bf215546Sopenharmony_ci /* Only move instructions without side effects. */ 188bf215546Sopenharmony_ci if (!can_move(instr, first->pass_flags)) 189bf215546Sopenharmony_ci continue; 190bf215546Sopenharmony_ci 191bf215546Sopenharmony_ci nir_ssa_def *def = nir_instr_ssa_def(instr); 192bf215546Sopenharmony_ci if (def) { 193bf215546Sopenharmony_ci bool all_uses_after_last = true; 194bf215546Sopenharmony_ci 195bf215546Sopenharmony_ci nir_foreach_use(use, def) { 196bf215546Sopenharmony_ci if (use->parent_instr->block == instr->block && 197bf215546Sopenharmony_ci use->parent_instr->index <= last->index) { 198bf215546Sopenharmony_ci all_uses_after_last = false; 199bf215546Sopenharmony_ci break; 200bf215546Sopenharmony_ci } 201bf215546Sopenharmony_ci } 202bf215546Sopenharmony_ci 203bf215546Sopenharmony_ci if (all_uses_after_last) { 204bf215546Sopenharmony_ci nir_instr *move_instr = instr; 205bf215546Sopenharmony_ci /* Set the last instruction because we'll delete the current one. */ 206bf215546Sopenharmony_ci instr = exec_node_data_forward(nir_instr, instr->node.next, node); 207bf215546Sopenharmony_ci 208bf215546Sopenharmony_ci /* Move the instruction after the last and update its index 209bf215546Sopenharmony_ci * to indicate that it's after it. 210bf215546Sopenharmony_ci */ 211bf215546Sopenharmony_ci nir_instr_move(nir_after_instr(last), move_instr); 212bf215546Sopenharmony_ci move_instr->index = last->index + 1; 213bf215546Sopenharmony_ci } 214bf215546Sopenharmony_ci } 215bf215546Sopenharmony_ci } 216bf215546Sopenharmony_ci 217bf215546Sopenharmony_ci struct check_sources_state state; 218bf215546Sopenharmony_ci state.block = first->block; 219bf215546Sopenharmony_ci state.first_index = first->index; 220bf215546Sopenharmony_ci 221bf215546Sopenharmony_ci /* Walk the instruction range between the first and last forward, and move 222bf215546Sopenharmony_ci * those that have no sources within the range before the first one. 223bf215546Sopenharmony_ci */ 224bf215546Sopenharmony_ci for (nir_instr *instr = exec_node_data_forward(nir_instr, 225bf215546Sopenharmony_ci first->node.next, node); 226bf215546Sopenharmony_ci instr != last; 227bf215546Sopenharmony_ci instr = exec_node_data_forward(nir_instr, instr->node.next, node)) { 228bf215546Sopenharmony_ci /* Only move instructions without side effects. */ 229bf215546Sopenharmony_ci if (!can_move(instr, first->pass_flags)) 230bf215546Sopenharmony_ci continue; 231bf215546Sopenharmony_ci 232bf215546Sopenharmony_ci if (nir_foreach_src(instr, has_only_sources_less_than, &state)) { 233bf215546Sopenharmony_ci nir_instr *move_instr = instr; 234bf215546Sopenharmony_ci /* Set the last instruction because we'll delete the current one. */ 235bf215546Sopenharmony_ci instr = exec_node_data_backward(nir_instr, instr->node.prev, node); 236bf215546Sopenharmony_ci 237bf215546Sopenharmony_ci /* Move the instruction before the first and update its index 238bf215546Sopenharmony_ci * to indicate that it's before it. 239bf215546Sopenharmony_ci */ 240bf215546Sopenharmony_ci nir_instr_move(nir_before_instr(first), move_instr); 241bf215546Sopenharmony_ci move_instr->index = first->index - 1; 242bf215546Sopenharmony_ci } 243bf215546Sopenharmony_ci } 244bf215546Sopenharmony_ci} 245bf215546Sopenharmony_ci 246bf215546Sopenharmony_cistatic bool 247bf215546Sopenharmony_ciis_pseudo_inst(nir_instr *instr) 248bf215546Sopenharmony_ci{ 249bf215546Sopenharmony_ci /* Other instructions do not usually contribute to the shader binary size. */ 250bf215546Sopenharmony_ci return instr->type != nir_instr_type_alu && 251bf215546Sopenharmony_ci instr->type != nir_instr_type_call && 252bf215546Sopenharmony_ci instr->type != nir_instr_type_tex && 253bf215546Sopenharmony_ci instr->type != nir_instr_type_intrinsic; 254bf215546Sopenharmony_ci} 255bf215546Sopenharmony_ci 256bf215546Sopenharmony_cistatic void 257bf215546Sopenharmony_ciset_instr_indices(nir_block *block) 258bf215546Sopenharmony_ci{ 259bf215546Sopenharmony_ci /* Start with 1 because we'll move instruction before the first one 260bf215546Sopenharmony_ci * and will want to label it 0. 261bf215546Sopenharmony_ci */ 262bf215546Sopenharmony_ci unsigned counter = 1; 263bf215546Sopenharmony_ci nir_instr *last = NULL; 264bf215546Sopenharmony_ci 265bf215546Sopenharmony_ci nir_foreach_instr(instr, block) { 266bf215546Sopenharmony_ci /* Make sure grouped instructions don't have the same index as pseudo 267bf215546Sopenharmony_ci * instructions. 268bf215546Sopenharmony_ci */ 269bf215546Sopenharmony_ci if (last && is_pseudo_inst(last) && is_grouped_load(instr)) 270bf215546Sopenharmony_ci counter++; 271bf215546Sopenharmony_ci 272bf215546Sopenharmony_ci /* Set each instruction's index within the block. */ 273bf215546Sopenharmony_ci instr->index = counter; 274bf215546Sopenharmony_ci 275bf215546Sopenharmony_ci /* Only count non-pseudo instructions. */ 276bf215546Sopenharmony_ci if (!is_pseudo_inst(instr)) 277bf215546Sopenharmony_ci counter++; 278bf215546Sopenharmony_ci 279bf215546Sopenharmony_ci last = instr; 280bf215546Sopenharmony_ci } 281bf215546Sopenharmony_ci} 282bf215546Sopenharmony_ci 283bf215546Sopenharmony_cistatic void 284bf215546Sopenharmony_cihandle_load_range(nir_instr **first, nir_instr **last, 285bf215546Sopenharmony_ci nir_instr *current, unsigned max_distance) 286bf215546Sopenharmony_ci{ 287bf215546Sopenharmony_ci if (*first && *last && 288bf215546Sopenharmony_ci (!current || current->index > (*first)->index + max_distance)) { 289bf215546Sopenharmony_ci assert(*first != *last); 290bf215546Sopenharmony_ci group_loads(*first, *last); 291bf215546Sopenharmony_ci set_instr_indices((*first)->block); 292bf215546Sopenharmony_ci *first = NULL; 293bf215546Sopenharmony_ci *last = NULL; 294bf215546Sopenharmony_ci } 295bf215546Sopenharmony_ci} 296bf215546Sopenharmony_ci 297bf215546Sopenharmony_cistatic bool 298bf215546Sopenharmony_ciis_barrier(nir_instr *instr) 299bf215546Sopenharmony_ci{ 300bf215546Sopenharmony_ci if (instr->type == nir_instr_type_intrinsic) { 301bf215546Sopenharmony_ci nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 302bf215546Sopenharmony_ci const char *name = nir_intrinsic_infos[intr->intrinsic].name; 303bf215546Sopenharmony_ci 304bf215546Sopenharmony_ci 305bf215546Sopenharmony_ci if (intr->intrinsic == nir_intrinsic_discard || 306bf215546Sopenharmony_ci intr->intrinsic == nir_intrinsic_discard_if || 307bf215546Sopenharmony_ci intr->intrinsic == nir_intrinsic_terminate || 308bf215546Sopenharmony_ci intr->intrinsic == nir_intrinsic_terminate_if || 309bf215546Sopenharmony_ci /* TODO: nir_intrinsics.py could do this */ 310bf215546Sopenharmony_ci strstr(name, "barrier")) 311bf215546Sopenharmony_ci return true; 312bf215546Sopenharmony_ci } 313bf215546Sopenharmony_ci 314bf215546Sopenharmony_ci return false; 315bf215546Sopenharmony_ci} 316bf215546Sopenharmony_ci 317bf215546Sopenharmony_cistruct indirection_state 318bf215546Sopenharmony_ci{ 319bf215546Sopenharmony_ci nir_block *block; 320bf215546Sopenharmony_ci unsigned indirections; 321bf215546Sopenharmony_ci}; 322bf215546Sopenharmony_ci 323bf215546Sopenharmony_cistatic unsigned 324bf215546Sopenharmony_ciget_num_indirections(nir_instr *instr); 325bf215546Sopenharmony_ci 326bf215546Sopenharmony_cistatic bool 327bf215546Sopenharmony_cigather_indirections(nir_src *src, void *data) 328bf215546Sopenharmony_ci{ 329bf215546Sopenharmony_ci struct indirection_state *state = (struct indirection_state *)data; 330bf215546Sopenharmony_ci nir_instr *instr = src->ssa->parent_instr; 331bf215546Sopenharmony_ci 332bf215546Sopenharmony_ci /* We only count indirections within the same block. */ 333bf215546Sopenharmony_ci if (instr->block == state->block) { 334bf215546Sopenharmony_ci unsigned indirections = get_num_indirections(src->ssa->parent_instr); 335bf215546Sopenharmony_ci 336bf215546Sopenharmony_ci if (instr->type == nir_instr_type_tex || is_memory_load(instr)) 337bf215546Sopenharmony_ci indirections++; 338bf215546Sopenharmony_ci 339bf215546Sopenharmony_ci state->indirections = MAX2(state->indirections, indirections); 340bf215546Sopenharmony_ci } 341bf215546Sopenharmony_ci 342bf215546Sopenharmony_ci return true; /* whether nir_foreach_src should keep going */ 343bf215546Sopenharmony_ci} 344bf215546Sopenharmony_ci 345bf215546Sopenharmony_ci/* Return the number of load indirections within the block. */ 346bf215546Sopenharmony_cistatic unsigned 347bf215546Sopenharmony_ciget_num_indirections(nir_instr *instr) 348bf215546Sopenharmony_ci{ 349bf215546Sopenharmony_ci /* Don't traverse phis because we could end up in an infinite recursion 350bf215546Sopenharmony_ci * if the phi points to the current block (such as a loop body). 351bf215546Sopenharmony_ci */ 352bf215546Sopenharmony_ci if (instr->type == nir_instr_type_phi) 353bf215546Sopenharmony_ci return 0; 354bf215546Sopenharmony_ci 355bf215546Sopenharmony_ci if (instr->index != UINT32_MAX) 356bf215546Sopenharmony_ci return instr->index; /* we've visited this instruction before */ 357bf215546Sopenharmony_ci 358bf215546Sopenharmony_ci struct indirection_state state; 359bf215546Sopenharmony_ci state.block = instr->block; 360bf215546Sopenharmony_ci state.indirections = 0; 361bf215546Sopenharmony_ci 362bf215546Sopenharmony_ci nir_foreach_src(instr, gather_indirections, &state); 363bf215546Sopenharmony_ci 364bf215546Sopenharmony_ci instr->index = state.indirections; 365bf215546Sopenharmony_ci return state.indirections; 366bf215546Sopenharmony_ci} 367bf215546Sopenharmony_ci 368bf215546Sopenharmony_cistatic void 369bf215546Sopenharmony_ciprocess_block(nir_block *block, nir_load_grouping grouping, 370bf215546Sopenharmony_ci unsigned max_distance) 371bf215546Sopenharmony_ci{ 372bf215546Sopenharmony_ci int max_indirection = -1; 373bf215546Sopenharmony_ci unsigned num_inst_per_level[256] = {0}; 374bf215546Sopenharmony_ci 375bf215546Sopenharmony_ci /* UINT32_MAX means the instruction has not been visited. Once 376bf215546Sopenharmony_ci * an instruction has been visited and its indirection level has been 377bf215546Sopenharmony_ci * determined, we'll store the indirection level in the index. The next 378bf215546Sopenharmony_ci * instruction that visits it will use the index instead of recomputing 379bf215546Sopenharmony_ci * the indirection level, which would result in an exponetial time 380bf215546Sopenharmony_ci * complexity. 381bf215546Sopenharmony_ci */ 382bf215546Sopenharmony_ci nir_foreach_instr(instr, block) { 383bf215546Sopenharmony_ci instr->index = UINT32_MAX; /* unknown */ 384bf215546Sopenharmony_ci } 385bf215546Sopenharmony_ci 386bf215546Sopenharmony_ci /* Count the number of load indirections for each load instruction 387bf215546Sopenharmony_ci * within this block. Store it in pass_flags. 388bf215546Sopenharmony_ci */ 389bf215546Sopenharmony_ci nir_foreach_instr(instr, block) { 390bf215546Sopenharmony_ci if (is_grouped_load(instr)) { 391bf215546Sopenharmony_ci unsigned indirections = get_num_indirections(instr); 392bf215546Sopenharmony_ci 393bf215546Sopenharmony_ci /* pass_flags has only 8 bits */ 394bf215546Sopenharmony_ci indirections = MIN2(indirections, 255); 395bf215546Sopenharmony_ci num_inst_per_level[indirections]++; 396bf215546Sopenharmony_ci instr->pass_flags = indirections; 397bf215546Sopenharmony_ci 398bf215546Sopenharmony_ci max_indirection = MAX2(max_indirection, (int)indirections); 399bf215546Sopenharmony_ci } 400bf215546Sopenharmony_ci } 401bf215546Sopenharmony_ci 402bf215546Sopenharmony_ci /* 255 contains all indirection levels >= 255, so ignore them. */ 403bf215546Sopenharmony_ci max_indirection = MIN2(max_indirection, 254); 404bf215546Sopenharmony_ci 405bf215546Sopenharmony_ci /* Each indirection level is grouped. */ 406bf215546Sopenharmony_ci for (int level = 0; level <= max_indirection; level++) { 407bf215546Sopenharmony_ci if (num_inst_per_level[level] <= 1) 408bf215546Sopenharmony_ci continue; 409bf215546Sopenharmony_ci 410bf215546Sopenharmony_ci set_instr_indices(block); 411bf215546Sopenharmony_ci 412bf215546Sopenharmony_ci nir_instr *resource = NULL; 413bf215546Sopenharmony_ci nir_instr *first_load = NULL, *last_load = NULL; 414bf215546Sopenharmony_ci 415bf215546Sopenharmony_ci /* Find the first and last instruction that use the same 416bf215546Sopenharmony_ci * resource and are within a certain distance of each other. 417bf215546Sopenharmony_ci * If found, group them by moving all movable instructions 418bf215546Sopenharmony_ci * between them out. 419bf215546Sopenharmony_ci */ 420bf215546Sopenharmony_ci nir_foreach_instr(current, block) { 421bf215546Sopenharmony_ci /* Don't group across barriers. */ 422bf215546Sopenharmony_ci if (is_barrier(current)) { 423bf215546Sopenharmony_ci /* Group unconditionally. */ 424bf215546Sopenharmony_ci handle_load_range(&first_load, &last_load, NULL, 0); 425bf215546Sopenharmony_ci first_load = NULL; 426bf215546Sopenharmony_ci last_load = NULL; 427bf215546Sopenharmony_ci continue; 428bf215546Sopenharmony_ci } 429bf215546Sopenharmony_ci 430bf215546Sopenharmony_ci /* Only group load instructions with the same indirection level. */ 431bf215546Sopenharmony_ci if (is_grouped_load(current) && current->pass_flags == level) { 432bf215546Sopenharmony_ci nir_instr *current_resource; 433bf215546Sopenharmony_ci 434bf215546Sopenharmony_ci switch (grouping) { 435bf215546Sopenharmony_ci case nir_group_all: 436bf215546Sopenharmony_ci if (!first_load) 437bf215546Sopenharmony_ci first_load = current; 438bf215546Sopenharmony_ci else 439bf215546Sopenharmony_ci last_load = current; 440bf215546Sopenharmony_ci break; 441bf215546Sopenharmony_ci 442bf215546Sopenharmony_ci case nir_group_same_resource_only: 443bf215546Sopenharmony_ci current_resource = get_uniform_inst_resource(current); 444bf215546Sopenharmony_ci 445bf215546Sopenharmony_ci if (current_resource) { 446bf215546Sopenharmony_ci if (!first_load) { 447bf215546Sopenharmony_ci first_load = current; 448bf215546Sopenharmony_ci resource = current_resource; 449bf215546Sopenharmony_ci } else if (current_resource == resource) { 450bf215546Sopenharmony_ci last_load = current; 451bf215546Sopenharmony_ci } 452bf215546Sopenharmony_ci } 453bf215546Sopenharmony_ci } 454bf215546Sopenharmony_ci } 455bf215546Sopenharmony_ci 456bf215546Sopenharmony_ci /* Group only if we exceeded the maximum distance. */ 457bf215546Sopenharmony_ci handle_load_range(&first_load, &last_load, current, max_distance); 458bf215546Sopenharmony_ci } 459bf215546Sopenharmony_ci 460bf215546Sopenharmony_ci /* Group unconditionally. */ 461bf215546Sopenharmony_ci handle_load_range(&first_load, &last_load, NULL, 0); 462bf215546Sopenharmony_ci } 463bf215546Sopenharmony_ci} 464bf215546Sopenharmony_ci 465bf215546Sopenharmony_ci/* max_distance is the maximum distance between the first and last instruction 466bf215546Sopenharmony_ci * in a group. 467bf215546Sopenharmony_ci */ 468bf215546Sopenharmony_civoid 469bf215546Sopenharmony_cinir_group_loads(nir_shader *shader, nir_load_grouping grouping, 470bf215546Sopenharmony_ci unsigned max_distance) 471bf215546Sopenharmony_ci{ 472bf215546Sopenharmony_ci nir_foreach_function(function, shader) { 473bf215546Sopenharmony_ci if (function->impl) { 474bf215546Sopenharmony_ci nir_foreach_block(block, function->impl) { 475bf215546Sopenharmony_ci process_block(block, grouping, max_distance); 476bf215546Sopenharmony_ci } 477bf215546Sopenharmony_ci 478bf215546Sopenharmony_ci nir_metadata_preserve(function->impl, 479bf215546Sopenharmony_ci nir_metadata_block_index | 480bf215546Sopenharmony_ci nir_metadata_dominance | 481bf215546Sopenharmony_ci nir_metadata_loop_analysis); 482bf215546Sopenharmony_ci } 483bf215546Sopenharmony_ci } 484bf215546Sopenharmony_ci} 485